ovsdb: raft: Fix cluster break down on leaving with some nodes down.
Consider the following scenario in a 3-node cluster:

  1. node-1 goes down for some reason without leaving the cluster.
  2. Some changes are committed to the log.
  3. node-2 requests to leave the cluster with the cluster/leave command.
  4. node-3 also requests to leave the cluster.
  5. node-1 comes back online.

Today, the cluster breaks in this scenario and doesn't recover.  The reason
is that both node-2 and node-3 are in the 'leaving' state, so they do not
initiate elections.  node-1 is behind on log updates and can't become a
leader, but there is no leader to catch it up.

For the cluster to recover in this situation, one of the leaving servers must
become a leader again, catch node-1 up with the log updates, and then node-1
can become a new leader, process both server removal requests and end up as
an operational cluster of 1.

The 'leaving' state is not a real state of the server in RAFT, and the server
should still participate in the cluster until the removal is committed by the
majority of the NEW configuration.  This can be achieved by allowing a server
in the 'leaving' state to initiate elections and become a leader again this
way.

Becoming a leader also means that this server will need to be able to execute
commands and do all the normal tasks of a leader.  Since the leader can't
leave the cluster though, this server will need to attempt to transfer
leadership again in order to actually leave.  This should be done after a
leave timeout.  The time between becoming a leader and transferring the
leadership will allow node-1 to get up to speed with all the cluster updates
and be ready to become a new leader.

Sending a server removal request right after transferring leadership can't
succeed, because the other server has to go through an election in order to
become a leader before it can process a removal request.  So, a delay is
added between the transfer and the removal request.  It can be lower than the
election timer, just to avoid waiting for too long if the election timer has
a large value.  But it should be randomized, so multiple leaving servers do
not just bounce the leadership between each other without ever getting to the
server removal request.

A test is added to stress different scenarios with servers leaving while some
of the cluster members are down.

Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
Reported-at: https://issues.redhat.com/browse/FDP-662
Tested-by: Terry Wilson <twilson@redhat.com>
Acked-by: Terry Wilson <twilson@redhat.com>
Acked-by: Mike Pattrick <mkp@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
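The randomized delay described above can be illustrated with a small
standalone sketch (not part of the patch).  The helpers below, rand_range()
and leave_retry_delay(), are hypothetical stand-ins for OVS's random_range()
and for the logic the diff adds in raft_send_leave_requests(); they only
demonstrate the 10-20% jitter window, assuming a millisecond election timer.

/*
 * Standalone sketch (not OVS code) of the retry-delay policy described in
 * the commit message.  The real logic lives in raft_send_leave_requests()
 * in the diff below and uses OVS's random_range() and time_msec() helpers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Returns a pseudo-random value in [0, max), or 0 if max is 0. */
static long long
rand_range(long long max)
{
    return max > 0 ? rand() % max : 0;
}

/* Picks how long to wait before the next leave attempt.  A server that was
 * the leader first hands leadership off and then retries after a short,
 * jittered delay (10%..20% of the election timer), so two leaving servers
 * are unlikely to keep bouncing leadership in lockstep.  A follower simply
 * retries after a full election timer. */
static long long
leave_retry_delay(long long election_timer_ms, int was_leader)
{
    if (was_leader) {
        return election_timer_ms / 10 + rand_range(election_timer_ms / 10);
    }
    return election_timer_ms;
}

int
main(void)
{
    srand((unsigned) time(NULL));

    for (int i = 0; i < 3; i++) {
        printf("leader retry delay:   %lld ms\n", leave_retry_delay(1000, 1));
    }
    printf("follower retry delay: %lld ms\n", leave_retry_delay(1000, 0));
    return 0;
}

With a 1000 ms election timer the leader-side delays land in the 100..199 ms
range: long enough for the next leader to win its election before the removal
request is retried, but much shorter than waiting out a full election timer.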
commit 9d15da43ba
parent 27315463b5

ovsdb/raft.c | 52
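The quorum arithmetic exercised by the new test below (a server's removal
commits only once a majority of the NEW, smaller configuration is reachable)
can be sketched as follows.  This is an illustrative helper, not OVS code;
can_commit_leave() is a hypothetical name and the counting convention
(reachable servers include the one that is leaving) is an assumption made
only for this example.

/* Illustrative sketch (not OVS code) of the quorum rule from the commit
 * message: a departing server's removal commits only when a majority of the
 * NEW configuration (the cluster without that server) is reachable. */
#include <stdbool.h>
#include <stdio.h>

/* 'cluster_size' counts all configured servers; 'alive' counts servers
 * currently reachable, including the one that wants to leave. */
static bool
can_commit_leave(int cluster_size, int alive)
{
    int new_size = cluster_size - 1;  /* Configuration without the leaver. */
    int alive_in_new = alive - 1;     /* The leaver is not part of it. */
    int quorum = new_size / 2 + 1;    /* Strict majority of the new size. */

    return alive_in_new >= quorum;
}

int
main(void)
{
    /* 5-node cluster, 2 servers down: new size 4 needs 3, only 2 remain. */
    printf("5 nodes, 3 alive: %s\n", can_commit_leave(5, 3) ? "yes" : "no");
    /* 4-node cluster, 1 server down: new size 3 needs 2, and 2 remain. */
    printf("4 nodes, 3 alive: %s\n", can_commit_leave(4, 3) ? "yes" : "no");
    /* 3-node cluster, 1 server down: new size 2 needs 2, only 1 remains. */
    printf("3 nodes, 2 alive: %s\n", can_commit_leave(3, 2) ? "yes" : "no");
    return 0;
}

These three cases correspond to the three leave attempts in the test: server 3
cannot leave the 5-node cluster with two members down, server 2 can leave the
4-node cluster with one member down, and servers 4 and 5 cannot leave the
3-node cluster until server 1 comes back.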
@@ -1377,8 +1377,29 @@ raft_send_remove_server_requests(struct raft *raft)
             raft_send(raft, &rpc);
         }
     }
+}
 
-    raft->leave_timeout = time_msec() + raft->election_timer;
+/* Sends requests required to leave the cluster and schedules the next time
+ * this function should be called. */
+static void
+raft_send_leave_requests(struct raft *raft)
+{
+    long long int delay = raft->election_timer;
+
+    if (raft->role == RAFT_LEADER) {
+        raft_transfer_leadership(raft, "this server is leaving the cluster");
+        raft_become_follower(raft);
+        /* Not sending the RemoveServerRequest right away, because a new
+         * leader has to be elected first for the request to be successful.
+         * But setting a shorter delay to avoid waiting for too long when
+         * the leader re-election is fast.  Randomized to avoid two servers
+         * bouncing the leadership between each other and never actually
+         * leaving. */
+        delay = delay / 10 + random_range(delay / 10);
+    } else {
+        raft_send_remove_server_requests(raft);
+    }
+    raft->leave_timeout = time_msec() + delay;
 }
 
 /* Attempts to start 'raft' leaving its cluster.  The caller can check progress
@@ -1392,10 +1413,7 @@ raft_leave(struct raft *raft)
     VLOG_INFO(SID_FMT": starting to leave cluster "CID_FMT,
               SID_ARGS(&raft->sid), CID_ARGS(&raft->cid));
     raft->leaving = true;
-    raft_transfer_leadership(raft, "this server is leaving the cluster");
-    raft_become_follower(raft);
-    raft_send_remove_server_requests(raft);
-    raft->leave_timeout = time_msec() + raft->election_timer;
+    raft_send_leave_requests(raft);
 }
 
 /* Returns true if 'raft' is currently attempting to leave its cluster. */
@@ -1867,10 +1885,6 @@ raft_start_election(struct raft *raft, bool is_prevote,
     /* Leadership transfer doesn't use pre-vote. */
     ovs_assert(!is_prevote || !leadership_transfer);
 
-    if (raft->leaving) {
-        return;
-    }
-
     struct raft_server *me = raft_find_server(raft, &raft->sid);
     if (!me) {
         return;
@@ -1997,6 +2011,12 @@ raft_conn_should_stay_open(struct raft *raft, struct raft_conn *conn)
         return true;
     }
 
+    /* Keep the connection until we send a RemoveServerReply. */
+    if (raft->remove_server
+        && uuid_equals(&conn->sid, &raft->remove_server->sid)) {
+        return true;
+    }
+
     /* We have joined the cluster.  If we did that "recently", then there is a
      * chance that we do not have the most recent server configuration log
      * entry.  If so, it's a waste to disconnect from the servers that were in
@@ -2123,6 +2143,8 @@ raft_run(struct raft *raft)
                 count ++;
             }
         }
+        VLOG_DBG("%d out of %"PRIuSIZE" servers replied",
+                 count, hmap_count(&raft->servers));
         if (count >= hmap_count(&raft->servers) / 2) {
             HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
                 server->replied = false;
@@ -2139,7 +2161,7 @@ raft_run(struct raft *raft)
     }
 
     if (raft->leaving && time_msec() >= raft->leave_timeout) {
-        raft_send_remove_server_requests(raft);
+        raft_send_leave_requests(raft);
     }
 
     if (raft->joining && time_msec() >= raft->join_timeout) {
@@ -2452,7 +2474,7 @@ raft_command_execute__(struct raft *raft, const struct json *data,
                        const struct json *servers, uint64_t election_timer,
                        const struct uuid *prereq, struct uuid *result)
 {
-    if (raft->joining || raft->leaving || raft->left || raft->failed) {
+    if (raft->joining || raft->left || raft->failed) {
         return raft_command_create_completed(RAFT_CMD_SHUTDOWN);
     }
 
@@ -4175,6 +4197,14 @@ raft_handle_remove_server_request(struct raft *raft,
         return;
     }
 
+    /* Check for the server already being removed. */
+    if (raft->remove_server
+        && uuid_equals(&rq->sid, &raft->remove_server->sid)) {
+        raft_send_remove_server_reply(raft, rq,
+                                      false, RAFT_SERVER_IN_PROGRESS);
+        return;
+    }
+
     /* If the server isn't configured, report that. */
     target = raft_find_server(raft, &rq->sid);
     if (!target) {
@@ -578,6 +578,165 @@ for i in $(seq $n); do
     OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
 done
 
 AT_CLEANUP
+
+AT_BANNER([OVSDB - cluster failure while leaving])
+AT_SETUP([OVSDB cluster - leaving the cluster with some servers down])
+AT_KEYWORDS([ovsdb server negative unix cluster leave])
+
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db \
+          $top_srcdir/vswitchd/vswitch.ovsschema unix:s1.raft], [0], [], [stderr])
+schema_name=$(ovsdb-tool schema-name $top_srcdir/vswitchd/vswitch.ovsschema)
+for i in 2 3 4 5; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill $(cat *.pid)'
+on_exit "
+  for i in \$(ls $(pwd)/s[[0-5]]); do
+    ovs-appctl --timeout 1 -t \$i cluster/status $schema_name;
+  done
+"
+dnl Starting all the servers.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+                --detach --no-chdir --log-file=s$i.log \
+                --pidfile=s$i.pid --unixctl=s$i \
+                --remote=punix:s$i.ovsdb s$i.db])
+done
+
+dnl Make sure that all servers joined the cluster.
+for i in 1 2 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is operational.
+m4_define([DB_REMOTE], [unix:s1.ovsdb,unix:s2.ovsdb,unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait init])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" -vovsdb_cs:console:dbg --no-leader-only \
+            --no-wait create QoS type=test-1], [0], [ignore], [ignore])
+
+dnl Stop servers 1 and 2.
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 3 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" -vovsdb_cs:console:dbg --no-leader-only \
+            --no-wait create QoS type=test-2], [0], [ignore], [ignore])
+
+dnl Servers 1 and 2 in a cluster of 5 are down, 3 servers are still alive.
+dnl Server 3 can't leave, because the NEW configuration will be a cluster of
+dnl 4 with 2 servers down and it doesn't have a quorum.  Try it.
+dnl The cluster will fall apart until servers 1 or 2 come back to resolve
+dnl the quorum issue, because servers 4 and 5 will no longer consider 3
+dnl to be part of the configuration.
+AT_CHECK([ovs-appctl -t $(pwd)/s3 cluster/leave $schema_name])
+
+dnl Check that the cluster is not operational.
+for i in 3 4 5; do
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                      | grep -qE 'leaving|disconnected'])
+done
+
+dnl Try to commit a transaction, it should not be successful.
+m4_define([DB_REMOTE], [unix:s3.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" -vovsdb_cs:console:dbg --no-leader-only \
+            --no-wait create QoS type=test-3], [1], [ignore], [stderr])
+
+dnl Now bring back the server 2.  This should allow server 3 to leave.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+            --detach --no-chdir --log-file=s2.log \
+            --pidfile=s2.pid --unixctl=s2 \
+            --remote=punix:s2.ovsdb s2.db])
+
+dnl Wait for server 3 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s3.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s3], [s3.pid])
+
+dnl Make sure that all remaining servers are functional as a cluster.
+for i in 2 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+dnl Make sure the cluster is operational again.
+m4_define([DB_REMOTE], [unix:s2.ovsdb,unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" -vovsdb_cs:console:dbg --no-leader-only \
+            --no-wait create QoS type=test-4], [0], [ignore], [ignore])
+
+dnl Now we have a cluster of 4 servers (1, 2, 4, 5) with 1 server down.
+dnl Server 2 should be able to leave, because the NEW configuration will
+dnl be a cluster of 3 servers with 1 being down and it has a quorum.
+AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/leave $schema_name])
+dnl Wait for server 2 to actually leave and stop the server.
+AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name removed])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s2], [s2.pid])
+
+dnl Make sure the cluster is still operational.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" -vovsdb_cs:console:dbg --no-leader-only \
+            --no-wait create QoS type=test-5], [0], [ignore], [ignore])
+
+dnl Now we have a cluster of 3 servers (1, 4, 5) with 1 server down.
+dnl None of the alive servers can leave, because the NEW configuration
+dnl will be a cluster of 2 with 1 server down and it has no quorum.
+dnl Request both to leave anyway.
+for i in 4 5; do
+    AT_CHECK([ovs-appctl -t $(pwd)/s$i cluster/leave $schema_name])
+done
+
+dnl Check that the cluster is not operational.
+for i in 4 5; do
+    OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name \
+                      | grep -qE 'leaving|disconnected'])
+done
+
+dnl Try to commit a transaction, it should not be successful.
+m4_define([DB_REMOTE], [unix:s4.ovsdb,unix:s5.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" -vovsdb_cs:console:dbg --no-leader-only \
+            --no-wait create QoS type=test-6], [1], [ignore], [stderr])
+
+dnl Now bring back the first server.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+            --detach --no-chdir --log-file=s1.log \
+            --pidfile=s1.pid --unixctl=s1 \
+            --remote=punix:s1.ovsdb s1.db])
+
+dnl Now it should be possible for all the other servers to leave, so we
+dnl should end up with a single-node cluster that consists of server 1.
+for i in 4 5; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name removed])
+done
+for i in 4 5; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
+done
+
+dnl Wait for the first server to become a leader of a single-node cluster.
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s1 cluster/status $schema_name \
+                  | grep -q 'Role: leader'])
+AT_CHECK([ovs-appctl -t $(pwd)/s1 cluster/status $schema_name \
+            | grep -c ' s[[1-5]] '], [0], [dnl
+1
+])
+
+dnl Check that the database is operational and the data is still in there.
+m4_define([DB_REMOTE], [unix:s1.ovsdb])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" -vovsdb_cs:console:dbg --no-wait \
+            create QoS type=test-7], [0], [ignore], [ignore])
+AT_CHECK([ovs-vsctl --db="DB_REMOTE" --no-wait \
+            --columns=type --bare list QoS | sed '/^$/d' | sort], [0], [dnl
+test-1
+test-2
+test-4
+test-5
+test-7
+])
+
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s1], [s1.pid])
+AT_CLEANUP