2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 01:51:26 +00:00

ipsec: libreswan: Fix premature reconciliation of just added tunnels.

Currently we're only tracking the last refresh time and perform
reconciliation of non-active connections on every refresh.  This is
causing issues in large clusters when tunnels are added sequentially.
Consider the following example:

 1. Tun-1 added -> refresh()
    -> Tun-1: adding 'in' and starting 'out'.

 2. Tun-2 added -> refresh()
    -> Tun-2: adding 'in' and starting 'out'.
    -> Tun-1: The other side didn't have time to initiate the 'in'
              connection yet, so it is not active.  But we see that
              it's not active and try to start it.

 3. Tun-3 added -> refresh()
    -> Tun-3: adding 'in' and starting 'out'.
    -> Tun-2: The other side didn't have time to initiate the 'in'
              connection yet, so it is not active.  But we see that
              it's not active and try to start it.
    -> Tun-1: The connection still had no time to become active, but
              we declare it 'defunct' and re-create it.

Behavior above is specific to Libreswan 4.  Libreswan 5 will report
UP connections as active in most cases, so they will not be marked
as defunct, but they will still be started quickly after addition
when it is not needed.

This creates unnecessary churn in the cluster and puts Libreswan into
an uncomfortable position where crossing stream issues (where both
sides are trying to establish the same connection at the same time)
are far more likely.

Fix that by specifically tracking time when we add or start each
connection instead of just the last time we refreshed for any reason.
This should make ovs-monitor-ipsec actually wait for the
reconciliation interval before attempting to repair connections and
give Libreswan a decent amount of time to process the changes and try
to establish connections normally.

Note: even though we could precisely track 15 seconds for each
individual connection and wake up when exactly 15 seconds expire,
we're not doing that in this patch.  The reason is that we still
need to wake up every 15 seconds to check that all the previously
active connections are still active, and doing that allows for
refreshing many connections in the same run instead of waking up
every second just for one connection.

Fixes: 25a301822e0d ("ipsec: libreswan: Reconcile missing connections periodically.")
Reported-at: https://issues.redhat.com/browse/FDP-1364
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
This commit is contained in:
Ilya Maximets 2025-05-07 18:26:49 +02:00
parent 1b2eff486f
commit 6fc5221742
2 changed files with 163 additions and 25 deletions

View File

@ -514,8 +514,7 @@ conn prevent_unencrypted_vxlan
self.IPSEC_D = "sql:" + libreswan_root_prefix + ipsec_d
self.IPSEC_CTL = libreswan_root_prefix + ipsec_ctl
self.conf_file = None
self.conns_not_active = set()
self.last_refresh = time.time()
self.conns_not_active = {"added": {}, "started": {}}
self.secrets_file = None
self.use_default_conn = self.IPSEC_CONF == self.ROOT_IPSEC_CONF
self.use_default_crypto = args.use_default_crypto
@ -642,6 +641,9 @@ conn prevent_unencrypted_vxlan
name = self.CERT_PREFIX + tunnel.conf["remote_name"]
self._nss_delete_cert(name)
# Return True once at least RECONCILIATION_INTERVAL has elapsed since
# 'timestamp' (a value previously obtained from time.time()), i.e. the
# connection has been given enough time and may now be reconciled.
def conn_should_reconcile(self, timestamp):
return time.time() >= timestamp + RECONCILIATION_INTERVAL
def refresh(self, monitor):
vlog.info("Refreshing LibreSwan configuration")
run_command(self.IPSEC_AUTO + ["--ctlsocket", self.IPSEC_CTL,
@ -662,12 +664,18 @@ conn prevent_unencrypted_vxlan
active = set(active_conns.get(name, dict()).keys())
# Untrack connections that became active.
self.conns_not_active.difference_update(active)
for conn in active:
self.conns_not_active["added"].pop(conn, None)
self.conns_not_active["started"].pop(conn, None)
# Remove connections that didn't become active after --start
# and another explicit --up.
for conn in self.conns_not_active & loaded:
self._delete_ipsec_connection(conn, "is defunct")
loaded.remove(conn)
# or an explicit --up.
for conn in list(loaded):
started = self.conns_not_active["started"].get(conn, None)
if started and self.conn_should_reconcile(started):
self._delete_ipsec_connection(conn, "is defunct")
loaded.remove(conn)
# Remove all the loaded or active but not desired connections.
for conn in loaded | active:
@ -706,10 +714,11 @@ conn prevent_unencrypted_vxlan
# desired == loaded and desired >= loaded + active,
# so loaded >= active
for conn in loaded - active:
vlog.info("Bringing up ipsec connection %s" % conn)
# On failure to --up it will be removed from the set.
self.conns_not_active.add(conn)
self._start_ipsec_connection(conn, "up")
added = self.conns_not_active["added"].get(conn, None)
if added and self.conn_should_reconcile(added):
vlog.info("Bringing up ipsec connection %s" % conn)
self._start_ipsec_connection(conn, "up")
# Update shunt policy if changed
if monitor.conf_in_use["skb_mark"] != monitor.conf["skb_mark"]:
@ -746,7 +755,13 @@ conn prevent_unencrypted_vxlan
"--delete",
"--asynchronous", "prevent_unencrypted_vxlan"])
monitor.conf_in_use["skb_mark"] = monitor.conf["skb_mark"]
self.last_refresh = time.time()
if monitor.tunnels and \
not self.conns_not_active["added"] and \
not self.conns_not_active["started"]:
vlog.info("Connections for all(%d) configured tunnels are Up."
% len(monitor.tunnels))
vlog.info("Refreshing is done.")
def get_conns_from_status(self, pattern):
@ -809,31 +824,51 @@ conn prevent_unencrypted_vxlan
return conns
def need_to_reconcile(self, monitor):
if time.time() - self.last_refresh < RECONCILIATION_INTERVAL:
return False
timestamps = (
timestamp for group in ("added", "started")
for timestamp in self.conns_not_active[group].values()
)
earliest = min(timestamps, default=None)
if earliest and self.conn_should_reconcile(earliest):
vlog.dbg("Need to check the state of the following connections: %s"
% self.conns_not_active)
return True
conns_dict = self.get_active_conns()
for ifname, tunnel in monitor.tunnels.items():
if ifname not in conns_dict:
vlog.info("Connection for port %s is not active, "
"need to reconcile" % ifname)
return True
existing_conns = conns_dict.get(ifname)
existing_conns = set(conns_dict.get(ifname, dict()).keys())
desired_conns = self.get_conn_names(monitor, ifname)
if set(existing_conns.keys()) != set(desired_conns):
for conn in existing_conns.symmetric_difference(desired_conns):
timestamp = None
for group in ("added", "started"):
timestamp = self.conns_not_active[group].get(conn, None)
if timestamp:
break
# If connection is on one of the 'not_active' lists, it should
# not be reconciled, unless sufficient time has passed.
#
# If it's not on any of the lists, it means that one of the
# previously active connections somehow became non-active, and
# we need to reconcile.
if timestamp and not self.conn_should_reconcile(timestamp):
continue
vlog.info("Active connections for port %s %s do not match "
"desired %s, need to reconcile"
% (ifname, list(existing_conns.keys()),
desired_conns))
% (ifname, list(existing_conns), desired_conns))
return True
return False
def _delete_ipsec_connection(self, conn, reason):
vlog.info("%s %s, removing" % (conn, reason))
self.conns_not_active.discard(conn)
self.conns_not_active["added"].pop(conn, None)
self.conns_not_active["started"].pop(conn, None)
run_command(self.IPSEC_AUTO +
["--ctlsocket", self.IPSEC_CTL,
"--config", self.ROOT_IPSEC_CONF,
@ -858,6 +893,15 @@ conn prevent_unencrypted_vxlan
# We don't know in which state the connection was left on
# failure. Try to clean it up.
self._delete_ipsec_connection(conn, "--%s failed" % action)
else:
# Update timestamps, make sure the connection is only tracked
# in one of the 'not_active' lists.
if action == "add":
self.conns_not_active["added"][conn] = time.time()
self.conns_not_active["started"].pop(conn, None)
else:
self.conns_not_active["added"].pop(conn, None)
self.conns_not_active["started"][conn] = time.time()
def _nss_clear_database(self):
"""Remove all OVS IPsec related state from the NSS database"""

View File

@ -535,6 +535,96 @@ AT_CHECK([grep -q -E "(ike|ikev2|esp)=" $ovs_base/right/custom.conf], [1])
OVS_TRAFFIC_VSWITCHD_STOP()
AT_CLEANUP
AT_SETUP([IPsec -- Libreswan - reconciliation interval is respected - ipv4])
AT_KEYWORDS([ipsec libreswan ipv4 geneve psk reconciliation])
dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988
CHECK_LIBRESWAN()
OVS_TRAFFIC_VSWITCHD_START()
IPSEC_SETUP_UNDERLAY()
m4_define([PSK_OPTIONS], [options:remote_ip=$1 options:psk=swordfish])
dnl Set up only the left host.
IPSEC_ADD_NODE_LEFT([10.1.1.1], [10.1.1.254])
IPSEC_ADD_TUNNEL_LEFT([geneve], PSK_OPTIONS([10.1.1.2]))
dnl Wait for the monitor to find the new connection.
OVS_WAIT_UNTIL([grep -q 'tun appeared' left/ovs-monitor-ipsec.log])
dnl Add a few more tunnels, waiting for the monitor to find them too.
m4_for([id], [1], 5, [1], [
OVS_VSCTL([left], add-port br-ipsec tun-id \
-- set Interface tun-id type=geneve PSK_OPTIONS(10.1.2.id))
OVS_WAIT_UNTIL(grep -q 'tun-id appeared' left/ovs-monitor-ipsec.log)
])
dnl And now remove all the extra tunnels.
m4_for([id], [1], 5, [1], [
OVS_VSCTL([left], del-port tun-id)
OVS_WAIT_UNTIL(grep -q 'tun-id disappeared' left/ovs-monitor-ipsec.log)
])
dnl Check that none of the connections were marked for reconciliation yet.
dnl It should take at least 15 seconds for this to happen.
AT_CHECK([grep -E 'half-loaded|defunct' left/ovs-monitor-ipsec.log], [1])
AT_CHECK([grep 'Bringing up' left/ovs-monitor-ipsec.log], [1])
dnl But we should eventually attempt to reconcile the original 'tun' tunnel.
OVS_WAIT_UNTIL([grep -qE 'Bringing up ipsec connection tun-[[io]]' \
left/ovs-monitor-ipsec.log])
dnl Now add the right host and check that connection works properly.
IPSEC_ADD_NODE_RIGHT([10.1.1.2], [10.1.1.254])
IPSEC_ADD_TUNNEL_RIGHT([geneve], PSK_OPTIONS([10.1.1.1]))
CHECK_ESP_TRAFFIC
OVS_TRAFFIC_VSWITCHD_STOP()
AT_CLEANUP
AT_SETUP([IPsec -- Libreswan - reconciliation interval is respected - ipv6])
AT_KEYWORDS([ipsec libreswan ipv6 geneve psk reconciliation])
dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988
CHECK_LIBRESWAN()
OVS_TRAFFIC_VSWITCHD_START()
IPSEC_SETUP_UNDERLAY()
m4_define([PSK_OPTIONS], [options:remote_ip=$1 options:psk=swordfish])
dnl Set up only the left host.
IPSEC_ADD_NODE_LEFT([fd01::101], [fd01::254])
IPSEC_ADD_TUNNEL_LEFT([geneve], PSK_OPTIONS([fd01::102]))
dnl Wait for the monitor to find the new connection.
OVS_WAIT_UNTIL([grep -q 'tun appeared' left/ovs-monitor-ipsec.log])
dnl Add a few more tunnels, waiting for the monitor to find them too.
m4_for([id], [1], 5, [1], [
OVS_VSCTL([left], add-port br-ipsec tun-id \
-- set Interface tun-id type=geneve PSK_OPTIONS(fd02::id))
OVS_WAIT_UNTIL(grep -q 'tun-id appeared' left/ovs-monitor-ipsec.log)
])
dnl And now remove all the extra tunnels.
m4_for([id], [1], 5, [1], [
OVS_VSCTL([left], del-port tun-id)
OVS_WAIT_UNTIL(grep -q 'tun-id disappeared' left/ovs-monitor-ipsec.log)
])
dnl Check that none of the connections were marked for reconciliation yet.
dnl It should take at least 15 seconds for this to happen.
AT_CHECK([grep -E 'half-loaded|defunct' left/ovs-monitor-ipsec.log], [1])
AT_CHECK([grep 'Bringing up' left/ovs-monitor-ipsec.log], [1])
dnl But we should eventually attempt to reconcile the original 'tun' tunnel.
OVS_WAIT_UNTIL([grep -qE 'Bringing up ipsec connection tun-[[io]]' \
left/ovs-monitor-ipsec.log])
dnl Now add the right host and check that connection works properly.
IPSEC_ADD_NODE_RIGHT([fd01::102], [fd01::254])
IPSEC_ADD_TUNNEL_RIGHT([geneve], PSK_OPTIONS([fd01::101]))
CHECK_ESP_TRAFFIC
OVS_TRAFFIC_VSWITCHD_STOP()
AT_CLEANUP
AT_SETUP([IPsec -- Libreswan - established conns survive new additions - ipv4])
AT_KEYWORDS([ipsec libreswan ipv4 geneve psk persistence])
dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
@ -752,6 +842,10 @@ if test -s stderr; then
auto=
fi
dnl Wait for the monitor to be done with all the connections.
OVS_WAIT_UNTIL([grep -q 'all(19) configured tunnels are Up' \
$ovs_base/node-1/ovs-monitor-ipsec.log])
dnl Remove connections for two tunnels. One fully and one partially.
AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
--config $ovs_base/node-1/ipsec.conf \
@ -764,7 +858,7 @@ AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
--delete tun-2-out-1], [0], [stdout])
dnl Wait for the monitor to notice the missing connections.
OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
OVS_WAIT_UNTIL([grep -qE 'tun-[[25]] .*need to reconcile' \
$ovs_base/node-1/ovs-monitor-ipsec.log])
dnl Wait for all the connections to be loaded back.