2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 18:07:40 +00:00
ovs/ovsdb/trigger.c

465 lines
16 KiB
C
Raw Normal View History

/* Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
2009-11-04 15:11:44 -08:00
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "trigger.h"
#include <limits.h>
#include <string.h>
2009-11-04 15:11:44 -08:00
#include "cooperative-multitasking.h"
#include "file.h"
#include "openvswitch/json.h"
2009-11-04 15:11:44 -08:00
#include "jsonrpc.h"
#include "ovsdb.h"
#include "ovsdb-error.h"
#include "openvswitch/poll-loop.h"
#include "server.h"
#include "transaction.h"
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
#include "transaction-forward.h"
#include "openvswitch/vlog.h"
#include "util.h"
#include "uuid.h"
2009-11-04 15:11:44 -08:00
VLOG_DEFINE_THIS_MODULE(trigger);
static bool ovsdb_trigger_try(struct ovsdb_trigger *, long long int now);
static void ovsdb_trigger_complete(struct ovsdb_trigger *);
static void trigger_convert_error(struct ovsdb_trigger *,
struct ovsdb_error *);
static void trigger_success(struct ovsdb_trigger *, struct json *result);
2009-11-04 15:11:44 -08:00
bool
ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db,
struct ovsdb_trigger *trigger,
struct jsonrpc_msg *request, long long int now,
bool read_only, const char *role, const char *id)
2009-11-04 15:11:44 -08:00
{
ovs_assert(!strcmp(request->method, "transact") ||
!strcmp(request->method, "convert"));
trigger->session = session;
trigger->db = db;
ovs_list_push_back(&trigger->db->triggers, &trigger->node);
2009-11-04 15:11:44 -08:00
trigger->request = request;
trigger->converted_db = NULL;
trigger->reply = NULL;
trigger->progress = NULL;
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
trigger->txn_forward = NULL;
2009-11-04 15:11:44 -08:00
trigger->created = now;
trigger->timeout_msec = LLONG_MAX;
trigger->read_only = read_only;
trigger->role = nullable_xstrdup(role);
trigger->id = nullable_xstrdup(id);
return ovsdb_trigger_try(trigger, now);
2009-11-04 15:11:44 -08:00
}
void
ovsdb_trigger_destroy(struct ovsdb_trigger *trigger)
{
ovsdb_txn_progress_destroy(trigger->progress);
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
ovsdb_txn_forward_destroy(trigger->db, trigger->txn_forward);
ovs_list_remove(&trigger->node);
ovsdb_destroy(trigger->converted_db);
jsonrpc_msg_destroy(trigger->request);
jsonrpc_msg_destroy(trigger->reply);
free(trigger->role);
free(trigger->id);
2009-11-04 15:11:44 -08:00
}
bool
ovsdb_trigger_is_complete(const struct ovsdb_trigger *trigger)
{
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
return trigger->reply && !trigger->progress && !trigger->txn_forward;
2009-11-04 15:11:44 -08:00
}
struct jsonrpc_msg *
ovsdb_trigger_steal_reply(struct ovsdb_trigger *trigger)
2009-11-04 15:11:44 -08:00
{
struct jsonrpc_msg *reply = trigger->reply;
trigger->reply = NULL;
return reply;
2009-11-04 15:11:44 -08:00
}
/* Cancels 'trigger'. 'reason' should be a human-readable reason for log
* messages etc. */
2009-11-04 15:11:44 -08:00
void
ovsdb_trigger_cancel(struct ovsdb_trigger *trigger, const char *reason)
{
if (trigger->progress) {
/* The transaction still might complete asynchronously, but we can stop
* tracking it. */
ovsdb_txn_progress_destroy(trigger->progress);
trigger->progress = NULL;
}
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
if (trigger->txn_forward) {
ovsdb_txn_forward_destroy(trigger->db, trigger->txn_forward);
trigger->txn_forward = NULL;
}
jsonrpc_msg_destroy(trigger->reply);
trigger->reply = NULL;
if (!strcmp(trigger->request->method, "transact")) {
/* There's no place to stick 'reason' into the error reply because RFC
* 7047 prescribes a fix form for these messages, see section 4.1.4. */
trigger->reply = jsonrpc_create_error(json_string_create("canceled"),
trigger->request->id);
ovsdb_trigger_complete(trigger);
} else if (!strcmp(trigger->request->method, "convert")) {
trigger_convert_error(
trigger,
ovsdb_error("canceled", "database conversion canceled because %s",
reason));
}
}
void
ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger)
{
if (!ovsdb_trigger_is_complete(trigger)) {
if (!strcmp(trigger->request->method, "transact")) {
ovsdb_trigger_cancel(trigger, "database schema is changing");
} else if (!strcmp(trigger->request->method, "convert")) {
/* We don't cancel "convert" requests when a database is being
* replaced for two reasons. First, we expect the administrator to
* do some kind of sensible synchronization on conversion requests,
* that is, it only really makes sense for the admin to do a single
* conversion at a time at a scheduled point. Second, if we did
* then every "convert" request would end up getting canceled since
* "convert" itself causes the database to be replaced. */
} else {
OVS_NOT_REACHED();
}
}
}
/* Find among incomplete triggers one that caused database conversion
* with specified transaction ID. */
struct ovsdb *
ovsdb_trigger_find_and_steal_converted_db(const struct ovsdb *db,
const struct uuid *txnid)
{
struct ovsdb *converted_db = NULL;
struct ovsdb_trigger *t;
if (uuid_is_zero(txnid)) {
return NULL;
}
LIST_FOR_EACH_SAFE (t, node, &db->triggers) {
if (t->db == db && t->converted_db
&& uuid_equals(&t->conversion_txnid, txnid)) {
converted_db = t->converted_db;
t->converted_db = NULL;
break;
}
}
return converted_db;
}
bool
2009-11-04 15:11:44 -08:00
ovsdb_trigger_run(struct ovsdb *db, long long int now)
{
struct ovsdb_trigger *t;
2009-11-04 15:11:44 -08:00
bool run_triggers = db->run_triggers;
raft: Avoid busy loop during leader election. When a server doesn't see a leader yet, e.g. during leader re-election, if a transaction comes from a client, it will cause 100% CPU busy loop. With debug log enabled it is like: 2020-02-28T04:04:35.631Z|00059|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00062|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00065|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00068|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00071|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00074|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00077|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 ... The problem is that in ovsdb_trigger_try(), all cluster errors are treated as temporary error and retry immediately. This patch fixes it by introducing 'run_triggers_now', which tells if a retry is needed immediately. When the cluster error is with detail 'not leader', we don't immediately retry, but will wait for the next poll event to trigger the retry. When 'not leader' status changes, there must be a event, i.e. raft RPC that changes the status, so the trigger is guaranteed to be triggered, without busy loop. Signed-off-by: Han Zhou <hzhou@ovn.org> Signed-off-by: Ben Pfaff <blp@ovn.org>
2020-02-28 18:07:07 -08:00
db->run_triggers_now = db->run_triggers = false;
bool disconnect_all = false;
LIST_FOR_EACH_SAFE (t, node, &db->triggers) {
cooperative_multitasking_yield();
if (run_triggers
|| now - t->created >= t->timeout_msec
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
|| t->progress || t->txn_forward) {
if (ovsdb_trigger_try(t, now)) {
disconnect_all = true;
}
2009-11-04 15:11:44 -08:00
}
}
return disconnect_all;
2009-11-04 15:11:44 -08:00
}
void
ovsdb_trigger_wait(struct ovsdb *db, long long int now)
{
raft: Avoid busy loop during leader election. When a server doesn't see a leader yet, e.g. during leader re-election, if a transaction comes from a client, it will cause 100% CPU busy loop. With debug log enabled it is like: 2020-02-28T04:04:35.631Z|00059|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00062|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00065|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00068|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00071|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00074|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00077|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 ... The problem is that in ovsdb_trigger_try(), all cluster errors are treated as temporary error and retry immediately. This patch fixes it by introducing 'run_triggers_now', which tells if a retry is needed immediately. When the cluster error is with detail 'not leader', we don't immediately retry, but will wait for the next poll event to trigger the retry. When 'not leader' status changes, there must be a event, i.e. raft RPC that changes the status, so the trigger is guaranteed to be triggered, without busy loop. Signed-off-by: Han Zhou <hzhou@ovn.org> Signed-off-by: Ben Pfaff <blp@ovn.org>
2020-02-28 18:07:07 -08:00
if (db->run_triggers_now) {
2009-11-04 15:11:44 -08:00
poll_immediate_wake();
} else {
long long int deadline = LLONG_MAX;
struct ovsdb_trigger *t;
LIST_FOR_EACH (t, node, &db->triggers) {
2009-11-04 15:11:44 -08:00
if (t->created < LLONG_MAX - t->timeout_msec) {
long long int t_deadline = t->created + t->timeout_msec;
if (deadline > t_deadline) {
deadline = t_deadline;
if (now >= deadline) {
break;
}
}
}
}
if (deadline < LLONG_MAX) {
poll_timer_wait_until(deadline);
2009-11-04 15:11:44 -08:00
}
}
}
static bool
ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now)
2009-11-04 15:11:44 -08:00
{
/* Handle "initialized" state. */
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
if (!t->reply && !t->txn_forward) {
ovs_assert(!t->progress);
struct ovsdb_txn *txn = NULL;
if (!strcmp(t->request->method, "transact")) {
ovsdb raft: Precheck prereq before proposing commit. In current OVSDB Raft design, when there are multiple transactions pending, either from same server node or different nodes in the cluster, only the first one can be successful at once, and following ones will fail at the prerequisite check on leader node, because the first one will update the expected prerequisite eid on leader node, and the prerequisite used for proposing a commit has to be committed eid, so it is not possible for a node to use the latest prerequisite expected by the leader to propose a commit until the lastest transaction is committed by the leader and updated the committed_index on the node. Current implementation proposes the commit as soon as the transaction is requested by the client, which results in continously retry which causes high CPU load and waste. Particularly, even if all clients are using leader_only to connect to only the leader, the prereq check failure still happens a lot when a batch of transactions are pending on the leader node - the leader node proposes a batch of commits using the same committed eid as prerequisite and it updates the expected prereq as soon as the first one is in progress, but it needs time to append to followers and wait until majority replies to update the committed_index, which results in continously useless retries of the following transactions proposed by the leader itself. This patch doesn't change the design but simplely pre-checks if current eid is same as prereq, before proposing the commit, to avoid waste of CPU cycles, for both leader and followers. When clients use leader_only mode, this patch completely eliminates the prereq check failures. In scale test of OVN with 1k HVs and creating and binding 10k lports, the patch resulted in 90% CPU cost reduction on leader and >80% CPU cost reduction on followers. (The test was with leader election base time set to 10000ms, because otherwise the test couldn't complete because of the frequent leader re-election.) This is just one of the related performance problems of the prereq checking mechanism dicussed at: https://mail.openvswitch.org/pipermail/ovs-discuss/2019-February/048243.html Signed-off-by: Han Zhou <hzhou8@ebay.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-03-01 10:56:37 -08:00
if (!ovsdb_txn_precheck_prereq(t->db)) {
return false;
}
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
bool durable, forwarding_needed;
struct json *result;
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
/* Trying to compose transaction. */
txn = ovsdb_execute_compose(
t->db, t->session, t->request->params, t->read_only,
t->role, t->id, now - t->created, &t->timeout_msec,
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
&durable, &forwarding_needed, &result);
if (!txn) {
if (result) {
/* Complete. There was an error but we still represent it
* in JSON-RPC as a successful result. */
trigger_success(t, result);
} else {
/* Unsatisfied "wait" condition. Take no action now, retry
* later. */
}
return false;
}
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
if (forwarding_needed) {
/* Transaction is good, but we don't need it. */
ovsdb_txn_abort(txn);
json_destroy(result);
/* Transition to "forwarding" state. */
t->txn_forward = ovsdb_txn_forward_create(t->db, t->request);
/* Forward will not be completed immediately. Will check
* next time. */
return false;
} else {
/* Transition to "committing" state. */
t->reply = jsonrpc_create_reply(result, t->request->id);
t->progress = ovsdb_txn_propose_commit(txn, durable);
}
} else if (!strcmp(t->request->method, "convert")) {
/* Permission check. */
if (t->role && *t->role) {
trigger_convert_error(
t, ovsdb_perm_error(
"RBAC rules for client \"%s\" role \"%s\" prohibit "
"\"convert\" of database %s "
"(only the root role may convert databases)",
t->id, t->role, t->db->schema->name));
return false;
}
if (t->read_only) {
trigger_convert_error(
t, ovsdb_error("not allowed", "conversion is not allowed "
"for read-only database %s",
t->db->schema->name));
return false;
}
/* Validate parameters. */
const struct json *params = t->request->params;
if (params->type != JSON_ARRAY || params->array.n != 2) {
trigger_convert_error(t, ovsdb_syntax_error(params, NULL,
"array expected"));
return false;
}
/* Parse new schema and make a converted copy. */
const struct json *new_schema_json = params->array.elems[1];
struct ovsdb_schema *new_schema;
struct ovsdb_error *error
= ovsdb_schema_from_json(new_schema_json, &new_schema);
if (!error && strcmp(new_schema->name, t->db->schema->name)) {
error = ovsdb_error("invalid parameters",
"new schema name (%s) does not match "
"database name (%s)",
new_schema->name, t->db->schema->name);
}
if (!error) {
ovsdb_destroy(t->converted_db);
error = ovsdb_convert(t->db, new_schema, &t->converted_db);
}
if (error) {
ovsdb_schema_destroy(new_schema);
trigger_convert_error(t, error);
return false;
}
ovsdb: Perform conversion with no data for clustered databases. Currently, database schema conversion in case of clustered database produces a transaction record with both new schema and converted database data. So, the sequence of events is following: 1. Get the new schema. 2. Convert the database to a new schema. 3. Translate the newly converted database into JSON. 4. Write the schema + data JSON to the storage. 5. Destroy converted version of a database. 6. Read schema + data JSON from the storage and parse. 7. Create a new database from a parsed database data. 8. Replace current database with the new one. Most of these steps are very computationally expensive. Also, conversion to/from JSON is much more expensive than direct database conversion with ovsdb_convert() that can make use of shallow data copies. Instead of doing all that, let's make use of previously introduced ability to not write the converted data into the storage. The process will look like this then: 1. Get the new schema. 2. Convert the database to a new schema (to verify that it is possible). 3. Write the schema to the storage. 4. Destroy converted version of a database. 5. Read the new schema from the storage and parse. 6. Convert the database to a new schema. 7. Replace current database with the new one. Most of the operations here are performed on the small schema object, instead of the actual database data. Two remaining data operations (actual conversion) are noticeably faster than conversion to/from JSON due to reference counting and shallow data copies. Steps 4-6 can be optimized later to not convert twice on the process that initiates the conversion. The change results in following performance improvements in conversion of OVN_Southbound database schema from version 20.23.0 to 20.27.0 (measured on a single-server RAFT cluster with no clients): | Before | After +---------+-------------------+---------+------------------ DB size | Total | Max poll interval | Total | Max poll interval --------+---------+-------------------+---------+------------------ 542 MB | 47 sec. | 26 sec. | 15 sec. | 10 sec. 225 MB | 19 sec. | 10 sec. | 6 sec. | 4.5 sec. 542 MB database had 19.5 M atoms, 225 MB database had 7.5 M atoms. Overall performance improvement is about 3x. Also, note that before this change database conversion basically doubles the database file on disk. Now it only writes a small schema JSON. Since the change requires backward-incompatible database file format changes, documentation is updated on how to perform an upgrade. Handled the same way as we did for the previous incompatible format change in 2.15 (column diffs). Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-December/052140.html Reviewed-by: Simon Horman <simon.horman@corigine.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-03-27 21:42:59 +02:00
struct json *txn_json;
if (ovsdb_conversion_with_no_data_supported(t->db)) {
txn_json = json_null_create();
} else {
/* Make the new copy into a transaction log record. */
txn_json = ovsdb_to_txn_json(
t->converted_db, "converted by ovsdb-server", true);
ovsdb: Perform conversion with no data for clustered databases. Currently, database schema conversion in case of clustered database produces a transaction record with both new schema and converted database data. So, the sequence of events is following: 1. Get the new schema. 2. Convert the database to a new schema. 3. Translate the newly converted database into JSON. 4. Write the schema + data JSON to the storage. 5. Destroy converted version of a database. 6. Read schema + data JSON from the storage and parse. 7. Create a new database from a parsed database data. 8. Replace current database with the new one. Most of these steps are very computationally expensive. Also, conversion to/from JSON is much more expensive than direct database conversion with ovsdb_convert() that can make use of shallow data copies. Instead of doing all that, let's make use of previously introduced ability to not write the converted data into the storage. The process will look like this then: 1. Get the new schema. 2. Convert the database to a new schema (to verify that it is possible). 3. Write the schema to the storage. 4. Destroy converted version of a database. 5. Read the new schema from the storage and parse. 6. Convert the database to a new schema. 7. Replace current database with the new one. Most of the operations here are performed on the small schema object, instead of the actual database data. Two remaining data operations (actual conversion) are noticeably faster than conversion to/from JSON due to reference counting and shallow data copies. Steps 4-6 can be optimized later to not convert twice on the process that initiates the conversion. The change results in following performance improvements in conversion of OVN_Southbound database schema from version 20.23.0 to 20.27.0 (measured on a single-server RAFT cluster with no clients): | Before | After +---------+-------------------+---------+------------------ DB size | Total | Max poll interval | Total | Max poll interval --------+---------+-------------------+---------+------------------ 542 MB | 47 sec. | 26 sec. | 15 sec. | 10 sec. 225 MB | 19 sec. | 10 sec. | 6 sec. | 4.5 sec. 542 MB database had 19.5 M atoms, 225 MB database had 7.5 M atoms. Overall performance improvement is about 3x. Also, note that before this change database conversion basically doubles the database file on disk. Now it only writes a small schema JSON. Since the change requires backward-incompatible database file format changes, documentation is updated on how to perform an upgrade. Handled the same way as we did for the previous incompatible format change in 2.15 (column diffs). Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-December/052140.html Reviewed-by: Simon Horman <simon.horman@corigine.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-03-27 21:42:59 +02:00
}
/* Propose the change. */
t->progress = ovsdb_txn_propose_schema_change(
t->db, new_schema, txn_json, &t->conversion_txnid);
ovsdb_schema_destroy(new_schema);
json_destroy(txn_json);
t->reply = jsonrpc_create_reply(json_object_create(),
t->request->id);
} else {
OVS_NOT_REACHED();
}
/* If the transaction committed synchronously, complete it and
* transition to "complete". This is more than an optimization because
* the file-based storage isn't implemented to read back the
* transactions that we write (which is an ugly broken abstraction but
* it's what we have). */
if (ovsdb_txn_progress_is_complete(t->progress)
&& !ovsdb_txn_progress_get_error(t->progress)) {
if (txn) {
ovsdb_txn_complete(txn);
}
ovsdb_txn_progress_destroy(t->progress);
t->progress = NULL;
ovsdb_trigger_complete(t);
if (t->converted_db) {
ovsdb_replace(t->db, t->converted_db);
t->converted_db = NULL;
return true;
}
return false;
}
/* Fall through to the general handling for the "committing" state. We
* abort the transaction--if and when it eventually commits, we'll read
* it back from storage and replay it locally. */
if (txn) {
ovsdb_txn_abort(txn);
}
}
/* Handle "committing" state. */
if (t->progress) {
if (!ovsdb_txn_progress_is_complete(t->progress)) {
return false;
}
/* Transition to "complete". */
struct ovsdb_error *error
= ovsdb_error_clone(ovsdb_txn_progress_get_error(t->progress));
ovsdb_txn_progress_destroy(t->progress);
t->progress = NULL;
if (error) {
if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) {
/* Temporary error. Transition back to "initialized" state to
* try again. */
raft: Avoid busy loop during leader election. When a server doesn't see a leader yet, e.g. during leader re-election, if a transaction comes from a client, it will cause 100% CPU busy loop. With debug log enabled it is like: 2020-02-28T04:04:35.631Z|00059|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00062|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00065|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00068|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00071|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00074|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00077|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 ... The problem is that in ovsdb_trigger_try(), all cluster errors are treated as temporary error and retry immediately. This patch fixes it by introducing 'run_triggers_now', which tells if a retry is needed immediately. When the cluster error is with detail 'not leader', we don't immediately retry, but will wait for the next poll event to trigger the retry. When 'not leader' status changes, there must be a event, i.e. raft RPC that changes the status, so the trigger is guaranteed to be triggered, without busy loop. Signed-off-by: Han Zhou <hzhou@ovn.org> Signed-off-by: Ben Pfaff <blp@ovn.org>
2020-02-28 18:07:07 -08:00
char *err_s = ovsdb_error_to_string(error);
VLOG_DBG("cluster error %s", err_s);
jsonrpc_msg_destroy(t->reply);
t->reply = NULL;
t->db->run_triggers = true;
raft: Avoid busy loop during leader election. When a server doesn't see a leader yet, e.g. during leader re-election, if a transaction comes from a client, it will cause 100% CPU busy loop. With debug log enabled it is like: 2020-02-28T04:04:35.631Z|00059|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00062|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00065|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00068|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00071|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00074|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 2020-02-28T04:04:35.631Z|00077|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 ... The problem is that in ovsdb_trigger_try(), all cluster errors are treated as temporary error and retry immediately. This patch fixes it by introducing 'run_triggers_now', which tells if a retry is needed immediately. When the cluster error is with detail 'not leader', we don't immediately retry, but will wait for the next poll event to trigger the retry. When 'not leader' status changes, there must be a event, i.e. raft RPC that changes the status, so the trigger is guaranteed to be triggered, without busy loop. Signed-off-by: Han Zhou <hzhou@ovn.org> Signed-off-by: Ben Pfaff <blp@ovn.org>
2020-02-28 18:07:07 -08:00
if (!strstr(err_s, "not leader")) {
t->db->run_triggers_now = true;
}
free(err_s);
ovsdb_error_destroy(error);
} else {
/* Permanent error. Transition to "completed" state to report
* it. */
if (!strcmp(t->request->method, "transact")) {
json_array_add(t->reply->result,
ovsdb_error_to_json_free(error));
ovsdb_trigger_complete(t);
} else if (!strcmp(t->request->method, "convert")) {
jsonrpc_msg_destroy(t->reply);
t->reply = NULL;
trigger_convert_error(t, error);
}
}
} else {
/* Success. */
ovsdb_trigger_complete(t);
}
ovsdb: relay: Add support for transaction forwarding. Current version of ovsdb relay allows to scale out read-only access to the primary database. However, many clients are not read-only but read-mostly. For example, ovn-controller. In order to scale out database access for this case ovsdb-server need to process transactions that are not read-only. Relay is not allowed to do that, i.e. not allowed to modify the database, but it can act like a proxy and forward transactions that includes database modifications to the primary server and forward replies back to a client. At the same time it may serve read-only transactions and monitor requests by itself greatly reducing the load on primary server. This configuration will slightly increase transaction latency, but it's not very important for read-mostly use cases. Implementation details: With this change instead of creating a trigger to commit the transaction, ovsdb-server will create a trigger for transaction forwarding. Later, ovsdb_relay_run() will send all new transactions to the relay source. Once transaction reply received from the relay source, ovsdb-relay module will update the state of the transaction forwarding with the reply. After that, trigger_run() will complete the trigger and jsonrpc_server_run() will send the reply back to the client. Since transaction reply from the relay source will be received after all the updates, client will receive all the updates before receiving the transaction reply as it is in a normal scenario with other database models. Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Acked-by: Dumitru Ceara <dceara@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-04-15 19:05:40 +02:00
return false;
} else if (t->txn_forward) {
/* Handle "forwarding" state. */
if (!ovsdb_txn_forward_is_complete(t->txn_forward)) {
return false;
}
/* Transition to "complete". */
ovs_assert(!t->reply);
t->reply = ovsdb_txn_forward_steal_reply(t->txn_forward);
ovsdb_txn_forward_destroy(t->db, t->txn_forward);
t->txn_forward = NULL;
ovsdb_trigger_complete(t);
return false;
2009-11-04 15:11:44 -08:00
}
OVS_NOT_REACHED();
2009-11-04 15:11:44 -08:00
}
static void
ovsdb_trigger_complete(struct ovsdb_trigger *t)
2009-11-04 15:11:44 -08:00
{
ovs_assert(t->reply);
ovs_list_remove(&t->node);
ovs_list_push_back(&t->session->completions, &t->node);
2009-11-04 15:11:44 -08:00
}
/* Makes a "convert" request into an error.
*
* This is not suitable for "transact" requests because their replies should
* never be bare ovsdb_errors: RFC 7047 says that their replies must either be
* a JSON-RPC reply that contains an array of operation replies (which can be
* errors), or a JSON-RPC error whose "error" member is simply "canceled". */
static void
trigger_convert_error(struct ovsdb_trigger *t, struct ovsdb_error *error)
{
ovs_assert(!strcmp(t->request->method, "convert"));
ovs_assert(error && !t->reply);
t->reply = jsonrpc_create_error(
ovsdb_error_to_json_free(error), t->request->id);
ovsdb_trigger_complete(t);
}
static void
trigger_success(struct ovsdb_trigger *t, struct json *result)
{
ovs_assert(result && !t->reply);
t->reply = jsonrpc_create_reply(result, t->request->id);
ovsdb_trigger_complete(t);
}