2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-30 22:05:19 +00:00
Files
ovs/ovsdb/trigger.c
Han Zhou 2cd62f75c1 ovsdb raft: Precheck prereq before proposing commit.
In current OVSDB Raft design, when there are multiple transactions
pending, either from same server node or different nodes in the
cluster, only the first one can be successful at once, and following
ones will fail at the prerequisite check on leader node, because
the first one will update the expected prerequisite eid on leader
node, and the prerequisite used for proposing a commit has to be
committed eid, so it is not possible for a node to use the latest
prerequisite expected by the leader to propose a commit until the
lastest transaction is committed by the leader and updated the
committed_index on the node.

Current implementation proposes the commit as soon as the transaction
is requested by the client, which results in continously retry which
causes high CPU load and waste.

Particularly, even if all clients are using leader_only to connect to
only the leader, the prereq check failure still happens a lot when
a batch of transactions are pending on the leader node - the leader
node proposes a batch of commits using the same committed eid as
prerequisite and it updates the expected prereq as soon as the first
one is in progress, but it needs time to append to followers and wait
until majority replies to update the committed_index, which results in
continously useless retries of the following transactions proposed by
the leader itself.

This patch doesn't change the design but simplely pre-checks if current
eid is same as prereq, before proposing the commit, to avoid waste of
CPU cycles, for both leader and followers. When clients use leader_only
mode, this patch completely eliminates the prereq check failures.

In scale test of OVN with 1k HVs and creating and binding 10k lports,
the patch resulted in 90% CPU cost reduction on leader and >80% CPU cost
reduction on followers. (The test was with leader election base time
set to 10000ms, because otherwise the test couldn't complete because
of the frequent leader re-election.)

This is just one of the related performance problems of the prereq
checking mechanism dicussed at:

https://mail.openvswitch.org/pipermail/ovs-discuss/2019-February/048243.html
Signed-off-by: Han Zhou <hzhou8@ebay.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-03-07 14:23:51 -08:00

381 lines
13 KiB
C

/* Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "trigger.h"
#include <limits.h>
#include <string.h>
#include "file.h"
#include "openvswitch/json.h"
#include "jsonrpc.h"
#include "ovsdb.h"
#include "ovsdb-error.h"
#include "openvswitch/poll-loop.h"
#include "server.h"
#include "transaction.h"
#include "openvswitch/vlog.h"
#include "util.h"
VLOG_DEFINE_THIS_MODULE(trigger);
static bool ovsdb_trigger_try(struct ovsdb_trigger *, long long int now);
static void ovsdb_trigger_complete(struct ovsdb_trigger *);
static void trigger_convert_error(struct ovsdb_trigger *,
struct ovsdb_error *);
static void trigger_success(struct ovsdb_trigger *, struct json *result);
bool
ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db,
struct ovsdb_trigger *trigger,
struct jsonrpc_msg *request, long long int now,
bool read_only, const char *role, const char *id)
{
ovs_assert(!strcmp(request->method, "transact") ||
!strcmp(request->method, "convert"));
trigger->session = session;
trigger->db = db;
ovs_list_push_back(&trigger->db->triggers, &trigger->node);
trigger->request = request;
trigger->reply = NULL;
trigger->progress = NULL;
trigger->created = now;
trigger->timeout_msec = LLONG_MAX;
trigger->read_only = read_only;
trigger->role = nullable_xstrdup(role);
trigger->id = nullable_xstrdup(id);
return ovsdb_trigger_try(trigger, now);
}
void
ovsdb_trigger_destroy(struct ovsdb_trigger *trigger)
{
ovsdb_txn_progress_destroy(trigger->progress);
ovs_list_remove(&trigger->node);
jsonrpc_msg_destroy(trigger->request);
jsonrpc_msg_destroy(trigger->reply);
free(trigger->role);
free(trigger->id);
}
bool
ovsdb_trigger_is_complete(const struct ovsdb_trigger *trigger)
{
return trigger->reply && !trigger->progress;
}
struct jsonrpc_msg *
ovsdb_trigger_steal_reply(struct ovsdb_trigger *trigger)
{
struct jsonrpc_msg *reply = trigger->reply;
trigger->reply = NULL;
return reply;
}
/* Cancels 'trigger'. 'reason' should be a human-readable reason for log
* messages etc. */
void
ovsdb_trigger_cancel(struct ovsdb_trigger *trigger, const char *reason)
{
if (trigger->progress) {
/* The transaction still might complete asynchronously, but we can stop
* tracking it. */
ovsdb_txn_progress_destroy(trigger->progress);
trigger->progress = NULL;
}
jsonrpc_msg_destroy(trigger->reply);
trigger->reply = NULL;
if (!strcmp(trigger->request->method, "transact")) {
/* There's no place to stick 'reason' into the error reply because RFC
* 7047 prescribes a fix form for these messages, see section 4.1.4. */
trigger->reply = jsonrpc_create_error(json_string_create("canceled"),
trigger->request->id);
ovsdb_trigger_complete(trigger);
} else if (!strcmp(trigger->request->method, "convert")) {
trigger_convert_error(
trigger,
ovsdb_error("canceled", "database conversion canceled because %s",
reason));
}
}
void
ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger)
{
if (!ovsdb_trigger_is_complete(trigger)) {
if (!strcmp(trigger->request->method, "transact")) {
ovsdb_trigger_cancel(trigger, "database schema is changing");
} else if (!strcmp(trigger->request->method, "convert")) {
/* We don't cancel "convert" requests when a database is being
* replaced for two reasons. First, we expect the administrator to
* do some kind of sensible synchronization on conversion requests,
* that is, it only really makes sense for the admin to do a single
* conversion at a time at a scheduled point. Second, if we did
* then every "convert" request would end up getting canceled since
* "convert" itself causes the database to be replaced. */
} else {
OVS_NOT_REACHED();
}
}
}
bool
ovsdb_trigger_run(struct ovsdb *db, long long int now)
{
struct ovsdb_trigger *t, *next;
bool run_triggers = db->run_triggers;
db->run_triggers = false;
bool disconnect_all = false;
LIST_FOR_EACH_SAFE (t, next, node, &db->triggers) {
if (run_triggers
|| now - t->created >= t->timeout_msec
|| t->progress) {
if (ovsdb_trigger_try(t, now)) {
disconnect_all = true;
}
}
}
return disconnect_all;
}
void
ovsdb_trigger_wait(struct ovsdb *db, long long int now)
{
if (db->run_triggers) {
poll_immediate_wake();
} else {
long long int deadline = LLONG_MAX;
struct ovsdb_trigger *t;
LIST_FOR_EACH (t, node, &db->triggers) {
if (t->created < LLONG_MAX - t->timeout_msec) {
long long int t_deadline = t->created + t->timeout_msec;
if (deadline > t_deadline) {
deadline = t_deadline;
if (now >= deadline) {
break;
}
}
}
}
if (deadline < LLONG_MAX) {
poll_timer_wait_until(deadline);
}
}
}
static bool
ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now)
{
/* Handle "initialized" state. */
if (!t->reply) {
ovs_assert(!t->progress);
struct ovsdb_txn *txn = NULL;
struct ovsdb *newdb = NULL;
if (!strcmp(t->request->method, "transact")) {
if (!ovsdb_txn_precheck_prereq(t->db)) {
return false;
}
bool durable;
struct json *result;
txn = ovsdb_execute_compose(
t->db, t->session, t->request->params, t->read_only,
t->role, t->id, now - t->created, &t->timeout_msec,
&durable, &result);
if (!txn) {
if (result) {
/* Complete. There was an error but we still represent it
* in JSON-RPC as a successful result. */
trigger_success(t, result);
} else {
/* Unsatisfied "wait" condition. Take no action now, retry
* later. */
}
return false;
}
/* Transition to "committing" state. */
t->reply = jsonrpc_create_reply(result, t->request->id);
t->progress = ovsdb_txn_propose_commit(txn, durable);
} else if (!strcmp(t->request->method, "convert")) {
/* Permission check. */
if (t->role && *t->role) {
trigger_convert_error(
t, ovsdb_perm_error(
"RBAC rules for client \"%s\" role \"%s\" prohibit "
"\"convert\" of database %s "
"(only the root role may convert databases)",
t->id, t->role, t->db->schema->name));
return false;
}
/* Validate parameters. */
const struct json *params = t->request->params;
if (params->type != JSON_ARRAY || params->array.n != 2) {
trigger_convert_error(t, ovsdb_syntax_error(params, NULL,
"array expected"));
return false;
}
/* Parse new schema and make a converted copy. */
const struct json *new_schema_json = params->array.elems[1];
struct ovsdb_schema *new_schema;
struct ovsdb_error *error
= ovsdb_schema_from_json(new_schema_json, &new_schema);
if (!error && strcmp(new_schema->name, t->db->schema->name)) {
error = ovsdb_error("invalid parameters",
"new schema name (%s) does not match "
"database name (%s)",
new_schema->name, t->db->schema->name);
}
if (!error) {
error = ovsdb_convert(t->db, new_schema, &newdb);
}
if (error) {
ovsdb_schema_destroy(new_schema);
trigger_convert_error(t, error);
return false;
}
/* Make the new copy into a transaction log record. */
struct json *txn_json = ovsdb_to_txn_json(
newdb, "converted by ovsdb-server");
/* Propose the change. */
t->progress = ovsdb_txn_propose_schema_change(
t->db, new_schema_json, txn_json);
json_destroy(txn_json);
t->reply = jsonrpc_create_reply(json_object_create(),
t->request->id);
} else {
OVS_NOT_REACHED();
}
/* If the transaction committed synchronously, complete it and
* transition to "complete". This is more than an optimization because
* the file-based storage isn't implemented to read back the
* transactions that we write (which is an ugly broken abstraction but
* it's what we have). */
if (ovsdb_txn_progress_is_complete(t->progress)
&& !ovsdb_txn_progress_get_error(t->progress)) {
if (txn) {
ovsdb_txn_complete(txn);
}
ovsdb_txn_progress_destroy(t->progress);
t->progress = NULL;
ovsdb_trigger_complete(t);
if (newdb) {
ovsdb_replace(t->db, newdb);
return true;
}
return false;
}
ovsdb_destroy(newdb);
/* Fall through to the general handling for the "committing" state. We
* abort the transaction--if and when it eventually commits, we'll read
* it back from storage and replay it locally. */
if (txn) {
ovsdb_txn_abort(txn);
}
}
/* Handle "committing" state. */
if (t->progress) {
if (!ovsdb_txn_progress_is_complete(t->progress)) {
return false;
}
/* Transition to "complete". */
struct ovsdb_error *error
= ovsdb_error_clone(ovsdb_txn_progress_get_error(t->progress));
ovsdb_txn_progress_destroy(t->progress);
t->progress = NULL;
if (error) {
if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) {
/* Temporary error. Transition back to "initialized" state to
* try again. */
jsonrpc_msg_destroy(t->reply);
t->reply = NULL;
t->db->run_triggers = true;
ovsdb_error_destroy(error);
} else {
/* Permanent error. Transition to "completed" state to report
* it. */
if (!strcmp(t->request->method, "transact")) {
json_array_add(t->reply->result,
ovsdb_error_to_json_free(error));
ovsdb_trigger_complete(t);
} else if (!strcmp(t->request->method, "convert")) {
jsonrpc_msg_destroy(t->reply);
t->reply = NULL;
trigger_convert_error(t, error);
}
}
} else {
/* Success. */
ovsdb_trigger_complete(t);
}
return false;
}
OVS_NOT_REACHED();
}
static void
ovsdb_trigger_complete(struct ovsdb_trigger *t)
{
ovs_assert(t->reply);
ovs_list_remove(&t->node);
ovs_list_push_back(&t->session->completions, &t->node);
}
/* Makes a "convert" request into an error.
*
* This is not suitable for "transact" requests because their replies should
* never be bare ovsdb_errors: RFC 7047 says that their replies must either be
* a JSON-RPC reply that contains an array of operation replies (which can be
* errors), or a JSON-RPC error whose "error" member is simply "canceled". */
static void
trigger_convert_error(struct ovsdb_trigger *t, struct ovsdb_error *error)
{
ovs_assert(!strcmp(t->request->method, "convert"));
ovs_assert(error && !t->reply);
t->reply = jsonrpc_create_error(
ovsdb_error_to_json_free(error), t->request->id);
ovsdb_trigger_complete(t);
}
static void
trigger_success(struct ovsdb_trigger *t, struct json *result)
{
ovs_assert(result && !t->reply);
t->reply = jsonrpc_create_reply(result, t->request->id);
ovsdb_trigger_complete(t);
}