/* Copyright (c) 2009, 2010, 2011, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>

#include "storage.h"
#include <string.h>
#include "log.h"
#include "ovsdb-error.h"
#include "openvswitch/json.h"
#include "openvswitch/poll-loop.h"
#include "openvswitch/vlog.h"
#include "ovsdb.h"
#include "raft.h"
#include "random.h"
#include "simap.h"
#include "timeval.h"
#include "util.h"

VLOG_DEFINE_THIS_MODULE(storage);

struct ovsdb_storage {
    /* There are three kinds of storage:
     *
     *    - Standalone, backed by a disk file.  'log' is nonnull, 'raft' is
     *      null.
     *
     *    - Clustered, backed by a Raft cluster.  'log' is null, 'raft' is
     *      nonnull.
     *
     *    - Memory only, unbacked.  'log' and 'raft' are null. */
    struct ovsdb_log *log;
    struct raft *raft;

    char *unbacked_name; /* Name of the unbacked storage. */

    /* All kinds of storage. */
    struct ovsdb_error *error;   /* If nonnull, a permanent error. */
    long long next_snapshot_min; /* Earliest time to take next snapshot. */
    long long next_snapshot_max; /* Latest time to take next snapshot. */

    /* Standalone only. */
    unsigned int n_read;
    unsigned int n_written;
};

static void schedule_next_snapshot(struct ovsdb_storage *, bool quick);

static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_open__(const char *filename, bool rw, bool allow_clustered,
                     struct ovsdb_storage **storagep)
{
    *storagep = NULL;

    struct ovsdb_log *log;
    struct ovsdb_error *error;
    error = ovsdb_log_open(filename, OVSDB_MAGIC"|"RAFT_MAGIC,
                           rw ? OVSDB_LOG_READ_WRITE : OVSDB_LOG_READ_ONLY,
                           -1, &log);
    if (error) {
        return error;
    }

    struct raft *raft = NULL;
    if (!strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC)) {
        if (!allow_clustered) {
            ovsdb_log_close(log);
            return ovsdb_error(NULL, "%s: cannot apply this operation to "
                               "clustered database file", filename);
        }
        error = raft_open(log, &raft);
        log = NULL;
        if (error) {
            return error;
        }
    }

    struct ovsdb_storage *storage = xzalloc(sizeof *storage);
    storage->log = log;
    storage->raft = raft;
    schedule_next_snapshot(storage, false);
    *storagep = storage;
    return NULL;
}

/* Opens 'filename' for use as storage.  If 'rw', opens it for read/write
 * access, otherwise read-only.  If successful, stores the new storage in
 * '*storagep' and returns NULL; on failure, stores NULL in '*storagep' and
 * returns the error.
 *
 * The returned storage might be clustered or standalone, depending on what the
 * disk file contains. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_open(const char *filename, bool rw,
                   struct ovsdb_storage **storagep)
{
    return ovsdb_storage_open__(filename, rw, true, storagep);
}

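/* Example of opening storage that may be either standalone or clustered
 * (a sketch only; the file name is illustrative):
 *
 *     struct ovsdb_storage *storage;
 *     struct ovsdb_error *error = ovsdb_storage_open("conf.db", true,
 *                                                     &storage);
 *     if (error) {
 *         char *s = ovsdb_error_to_string_free(error);
 *         VLOG_ERR("%s", s);
 *         free(s);
 *     }
 */

/* As ovsdb_storage_open(), but only accepts standalone database files (a
 * clustered database file is rejected) and treats any failure as fatal to
 * the process. */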
struct ovsdb_storage *
ovsdb_storage_open_standalone(const char *filename, bool rw)
{
    struct ovsdb_storage *storage;
    struct ovsdb_error *error = ovsdb_storage_open__(filename, rw, false,
                                                     &storage);
    if (error) {
        ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
    }
    return storage;
}

/* Creates and returns new storage without any backing.  Nothing will be read
 * from the storage, and writes are discarded.  If 'name' is nonnull, it will
 * be used as a storage name. */
struct ovsdb_storage *
ovsdb_storage_create_unbacked(const char *name)
{
    struct ovsdb_storage *storage = xzalloc(sizeof *storage);
    schedule_next_snapshot(storage, false);
    storage->unbacked_name = nullable_xstrdup(name);
    return storage;
}

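/* Closes 'storage' and frees all of the memory and resources associated with
 * it.  'storage' may be NULL. */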
void
ovsdb_storage_close(struct ovsdb_storage *storage)
{
    if (storage) {
        ovsdb_log_close(storage->log);
        raft_close(storage->raft);
        ovsdb_error_destroy(storage->error);
        free(storage->unbacked_name);
        free(storage);
    }
}

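/* Returns a short human-readable name for the kind of storage: "clustered"
 * if 'storage' is backed by a Raft cluster, otherwise "standalone". */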
const char *
ovsdb_storage_get_model(const struct ovsdb_storage *storage)
{
    return storage->raft ? "clustered" : "standalone";
}

bool
ovsdb_storage_is_clustered(const struct ovsdb_storage *storage)
{
    return storage->raft != NULL;
}

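/* Returns true if 'storage' is ready for use: always true for standalone and
 * unbacked storage, and true for clustered storage when the underlying Raft
 * server reports that it is connected to the cluster. */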
bool
ovsdb_storage_is_connected(const struct ovsdb_storage *storage)
{
    return !storage->raft || raft_is_connected(storage->raft);
}

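/* Returns true if 'storage' is clustered and its Raft server has permanently
 * left the cluster, so that the storage can no longer be used.  Always false
 * for standalone and unbacked storage. */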
bool
ovsdb_storage_is_dead(const struct ovsdb_storage *storage)
{
    return storage->raft && raft_left(storage->raft);
}

bool
ovsdb_storage_is_leader(const struct ovsdb_storage *storage)
{
    return !storage->raft || raft_is_leader(storage->raft);
}

const struct uuid *
ovsdb_storage_get_cid(const struct ovsdb_storage *storage)
{
    return storage->raft ? raft_get_cid(storage->raft) : NULL;
}

const struct uuid *
ovsdb_storage_get_sid(const struct ovsdb_storage *storage)
{
    return storage->raft ? raft_get_sid(storage->raft) : NULL;
}

uint64_t
ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage)
{
    return storage->raft ? raft_get_applied_index(storage->raft) : 0;
}

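/* Adds to 'usage' the memory usage of 'storage', if any; only clustered
 * storage currently reports memory usage. */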
void
ovsdb_storage_get_memory_usage(const struct ovsdb_storage *storage,
                               struct simap *usage)
{
    if (storage->raft) {
        raft_get_memory_usage(storage->raft, usage);
    }
}

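/* Returns a copy of 'storage''s permanent error as a string that the caller
 * must free, or NULL if no such error has been recorded. */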
char *
ovsdb_storage_get_error(const struct ovsdb_storage *storage)
{
    if (storage->error) {
        return ovsdb_error_to_string(storage->error);
    }

    return NULL;
}

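/* Performs periodic maintenance on 'storage'.  For clustered storage this
 * runs the Raft state machine; other kinds of storage need no maintenance.
 * Call this from the main loop, paired with ovsdb_storage_wait(). */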
void
ovsdb_storage_run(struct ovsdb_storage *storage)
{
    if (storage->raft) {
        raft_run(storage->raft);
    }
}

void
ovsdb_storage_wait(struct ovsdb_storage *storage)
{
    if (storage->raft) {
        raft_wait(storage->raft);
    }
}

/* Returns 'storage''s embedded name, if it has one, otherwise null.
 *
 * Only clustered storage and unbacked storage created with a name have a
 * built-in name. */
const char *
ovsdb_storage_get_name(const struct ovsdb_storage *storage)
{
    return storage->unbacked_name ? storage->unbacked_name
           : storage->raft ? raft_get_name(storage->raft)
           : NULL;
}

/* Attempts to read a log record from 'storage'.
 *
 * If successful, returns NULL and stores the transaction information in
 * '*schemap', '*txnp', and '*txnid'.  At least one of these will be nonnull.
 * The caller owns the data and must eventually free it (with json_destroy()).
 *
 * If 'storage' is not clustered, 'txnid' may be null.
 *
 * If a read error occurs, returns the error and stores NULL in '*schemap' and
 * '*txnp'.
 *
 * If the read reaches end of file, returns NULL and stores NULL in '*schemap'
 * and '*txnp'. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_read(struct ovsdb_storage *storage,
                   struct ovsdb_schema **schemap,
                   struct json **txnp,
                   struct uuid *txnid)
{
    *schemap = NULL;
    *txnp = NULL;
    if (txnid) {
        *txnid = UUID_ZERO;
    }

    struct json *json;
    struct json *schema_json = NULL;
    struct json *txn_json = NULL;
    if (storage->raft) {
        json = raft_next_entry(storage->raft, txnid);
        if (!json) {
            return NULL;
        } else if (json->type != JSON_ARRAY || json->array.n != 2) {
            json_destroy(json);
            return ovsdb_error(NULL, "invalid commit format");
        }

        struct json **e = json->array.elems;
        schema_json = e[0]->type != JSON_NULL ? e[0] : NULL;
        txn_json = e[1]->type != JSON_NULL ? e[1] : NULL;
    } else if (storage->log) {
        struct ovsdb_error *error = ovsdb_log_read(storage->log, &json);
        if (error || !json) {
            return error;
        }

        unsigned int n = storage->n_read++;
        struct json **jsonp = !n ? &schema_json : &txn_json;
        *jsonp = json;
        if (n == 1) {
            ovsdb_log_mark_base(storage->log);
        }
    } else {
        /* Unbacked.  Nothing to do. */
        return NULL;
    }

    /* If we got this far then we must have at least a schema or a
     * transaction. */
    ovs_assert(schema_json || txn_json);

    if (schema_json) {
        struct ovsdb_schema *schema;
        struct ovsdb_error *error = ovsdb_schema_from_json(schema_json,
                                                           &schema);
        if (error) {
            json_destroy(json);
            return error;
        }

        const char *storage_name = ovsdb_storage_get_name(storage);
        const char *schema_name = schema->name;
        if (storage_name && strcmp(storage_name, schema_name)) {
            error = ovsdb_error(NULL, "name %s in header does not match "
                                "name %s in schema",
                                storage_name, schema_name);
            json_destroy(json);
            ovsdb_schema_destroy(schema);
            return error;
        }

        *schemap = schema;
    }

    if (txn_json) {
        *txnp = json_clone(txn_json);
    }

    json_destroy(json);
    return NULL;
}

/* Reads and returns the schema from standalone storage 'storage'.  Terminates
 * with an error on failure. */
struct ovsdb_schema *
ovsdb_storage_read_schema(struct ovsdb_storage *storage)
{
    ovs_assert(storage->log);

    struct json *txn_json;
    struct ovsdb_schema *schema;
    struct ovsdb_error *error = ovsdb_storage_read(storage, &schema,
                                                   &txn_json, NULL);
    if (error) {
        ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
    }
    if (!schema && !txn_json) {
        ovs_fatal(0, "unexpected end of file reading schema");
    }
    ovs_assert(schema && !txn_json);

    return schema;
}

bool
ovsdb_storage_read_wait(struct ovsdb_storage *storage)
{
    return (storage->raft
            ? raft_has_next_entry(storage->raft)
            : false);
}

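/* For standalone storage, makes the next ovsdb_storage_read() re-read the
 * most recently read record.  Clustered storage cannot re-read a Raft entry,
 * so in that case this records a permanent "inconsistent data" error
 * instead. */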
void
ovsdb_storage_unread(struct ovsdb_storage *storage)
{
    if (storage->error) {
        return;
    }

    if (storage->raft) {
        if (!storage->error) {
            storage->error = ovsdb_error(NULL, "inconsistent data");
        }
    } else if (storage->log) {
        ovsdb_log_unread(storage->log);
    }
}

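/* Tracks an asynchronous write started by ovsdb_storage_write() or
 * ovsdb_storage_write_schema_change().  For clustered storage the write
 * completes when the underlying Raft command completes; for standalone and
 * unbacked storage it completes immediately.  Poll it with
 * ovsdb_write_is_complete() and ovsdb_write_wait(), then free it with
 * ovsdb_write_destroy(). */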
struct ovsdb_write {
    struct ovsdb_error *error;
    struct raft_command *command;
};

/* Not suitable for writing transactions that change the schema. */
struct ovsdb_write * OVS_WARN_UNUSED_RESULT
ovsdb_storage_write(struct ovsdb_storage *storage, const struct json *data,
                    const struct uuid *prereq, struct uuid *resultp,
                    bool durable)
{
    struct ovsdb_write *w = xzalloc(sizeof *w);
    struct uuid result = UUID_ZERO;
    if (storage->error) {
        w->error = ovsdb_error_clone(storage->error);
    } else if (storage->raft) {
        struct json *txn_json = json_array_create_2(json_null_create(),
                                                    json_clone(data));
        w->command = raft_command_execute(storage->raft, txn_json,
                                          prereq, &result);
        json_destroy(txn_json);
    } else if (storage->log) {
        w->error = ovsdb_log_write(storage->log, data);
        if (!w->error) {
            storage->n_written++;
            if (durable) {
                w->error = ovsdb_log_commit_block(storage->log);
            }
        }
    } else {
        /* When 'error' and 'command' are both null, it indicates that the
         * command is complete.  This is fine since this unbacked storage drops
         * writes. */
    }
    if (resultp) {
        *resultp = result;
    }
    return w;
}

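/* As ovsdb_storage_write(), but blocks, running and waiting on 'storage' and
 * the poll loop, until the write completes, then returns the write's error,
 * if any. */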
/* Not suitable for writing transactions that change the schema. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_write_block(struct ovsdb_storage *storage,
                          const struct json *data, const struct uuid *prereq,
                          struct uuid *resultp, bool durable)
{
    struct ovsdb_write *w = ovsdb_storage_write(storage, data,
                                                prereq, resultp, durable);
    while (!ovsdb_write_is_complete(w)) {
        if (storage->raft) {
            raft_run(storage->raft);
        }

        ovsdb_write_wait(w);
        if (storage->raft) {
            raft_wait(storage->raft);
        }
        poll_block();
    }

    struct ovsdb_error *error = ovsdb_error_clone(ovsdb_write_get_error(w));
    ovsdb_write_destroy(w);
    return error;
}

bool
ovsdb_write_is_complete(const struct ovsdb_write *w)
{
    return (w->error
            || !w->command
            || raft_command_get_status(w->command) != RAFT_CMD_INCOMPLETE);
}

const struct ovsdb_error *
ovsdb_write_get_error(const struct ovsdb_write *w_)
{
    struct ovsdb_write *w = CONST_CAST(struct ovsdb_write *, w_);
    ovs_assert(ovsdb_write_is_complete(w));

    if (w->command && !w->error) {
        enum raft_command_status status = raft_command_get_status(w->command);
        if (status != RAFT_CMD_SUCCESS) {
            w->error = ovsdb_error("cluster error", "%s",
                                   raft_command_status_to_string(status));
        }
    }

    return w->error;
}

uint64_t
ovsdb_write_get_commit_index(const struct ovsdb_write *w)
{
    ovs_assert(ovsdb_write_is_complete(w));
    return (w->command && !w->error
            ? raft_command_get_commit_index(w->command)
            : 0);
}

void
ovsdb_write_wait(const struct ovsdb_write *w)
{
    if (ovsdb_write_is_complete(w)) {
        poll_immediate_wake();
    }
}

void
ovsdb_write_destroy(struct ovsdb_write *w)
{
    if (w) {
        raft_command_unref(w->command);
        ovsdb_error_destroy(w->error);
        free(w);
    }
}

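/* Schedules the next snapshot for 'storage'.  For backed storage, the
 * earliest time is a random point 10 to 20 minutes from now (1 to 2 minutes
 * if 'quick') and, unless this is a quick reschedule, the latest time is 24
 * hours from now.  Unbacked storage never takes snapshots. */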
static void
schedule_next_snapshot(struct ovsdb_storage *storage, bool quick)
{
    if (storage->log || storage->raft) {
        unsigned int base = 10 * 60 * 1000;  /* 10 minutes */
        unsigned int range = 10 * 60 * 1000; /* 10 minutes */
        if (quick) {
            base /= 10;
            range /= 10;
        }

        long long int now = time_msec();
        storage->next_snapshot_min = now + base + random_range(range);
        if (!quick) {
            long long int one_day = 60LL * 60 * 24 * 1000;

            storage->next_snapshot_max = now + one_day;
        }
    } else {
        storage->next_snapshot_min = LLONG_MAX;
        storage->next_snapshot_max = LLONG_MAX;
    }
}

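/* Returns true if this is a good time to write a snapshot of 'storage' (that
 * is, to compact its log): the minimum scheduled time has passed, the log has
 * grown enough (or the maximum scheduled time has passed), and the Raft
 * layer, if any, is currently able to take a snapshot. */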
bool
ovsdb_storage_should_snapshot(struct ovsdb_storage *storage)
{
    if (storage->raft || storage->log) {
        /* If we haven't reached the minimum snapshot time, don't snapshot. */
        long long int now = time_msec();
        if (now < storage->next_snapshot_min) {
            return false;
        }

        uint64_t log_len = (storage->raft
                            ? raft_get_log_length(storage->raft)
                            : storage->n_read + storage->n_written);
        bool snapshot_recommended = false;

        if (now < storage->next_snapshot_max) {
            /* Maximum snapshot time not yet reached.  Take a snapshot if there
             * have been at least 100 log entries and the log file size has
             * grown a lot. */
            bool grew_lots = (storage->raft
                              ? raft_grew_lots(storage->raft)
                              : ovsdb_log_grew_lots(storage->log));
            snapshot_recommended = (log_len >= 100 && grew_lots);
        } else {
            /* We have reached the maximum snapshot time.  Take a snapshot if
             * there have been any log entries at all. */
            snapshot_recommended = (log_len > 0);
        }

        if (!snapshot_recommended) {
            if (storage->raft) {
                /* Re-scheduling with a quick retry in order to avoid condition
                 * where all the raft servers passed the minimal time already,
                 * but the log didn't grow a lot, so they are all checking on
                 * every iteration.  This will randomize the time of the next
                 * attempt, so all the servers will not start snapshotting at
                 * the same time when the log reaches a critical size. */
                schedule_next_snapshot(storage, true);
            }
            return false;
        }

        /* If we can't snapshot right now, don't. */
        if (storage->raft && !raft_may_snapshot(storage->raft)) {
            /* Notifying the storage that it needs to make a snapshot soon. */
            raft_notify_snapshot_recommended(storage->raft);
            return false;
        }

        return true;
    }

    return false;
}

static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_store_snapshot__(struct ovsdb_storage *storage,
                               const struct json *schema,
                               const struct json *data)
{
    if (storage->raft) {
        struct json *entries = json_array_create_empty();
        if (schema) {
            json_array_add(entries, json_clone(schema));
        }
        if (data) {
            json_array_add(entries, json_clone(data));
        }
        struct ovsdb_error *error = raft_store_snapshot(storage->raft,
                                                        entries);
        json_destroy(entries);
        return error;
    } else if (storage->log) {
        struct json *entries[2];
        size_t n = 0;
        if (schema) {
            entries[n++] = CONST_CAST(struct json *, schema);
        }
        if (data) {
            entries[n++] = CONST_CAST(struct json *, data);
        }
        return ovsdb_log_replace(storage->log, entries, n);
    } else {
        return NULL;
    }
}

/* 'schema' and 'data' should faithfully represent the current schema and data,
 * otherwise the two storage backing formats will yield divergent results.  Use
 * ovsdb_storage_write_schema_change() to change the schema. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_store_snapshot(struct ovsdb_storage *storage,
                             const struct json *schema,
                             const struct json *data)
{
    struct ovsdb_error *error = ovsdb_storage_store_snapshot__(storage,
                                                               schema, data);
    bool retry_quickly = error != NULL;
    schedule_next_snapshot(storage, retry_quickly);
    return error;
}

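/* Writes a transaction that changes the schema to 'schema' and replaces the
 * entire data with 'data'.  For clustered storage this is proposed as a Raft
 * command; for standalone storage the log is rewritten with the new snapshot.
 * The returned ovsdb_write must be monitored and freed just as for
 * ovsdb_storage_write(). */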
struct ovsdb_write * OVS_WARN_UNUSED_RESULT
ovsdb_storage_write_schema_change(struct ovsdb_storage *storage,
                                  const struct json *schema,
                                  const struct json *data,
                                  const struct uuid *prereq,
                                  struct uuid *resultp)
{
    struct ovsdb_write *w = xzalloc(sizeof *w);
    struct uuid result = UUID_ZERO;
    if (storage->error) {
        w->error = ovsdb_error_clone(storage->error);
    } else if (storage->raft) {
        struct json *txn_json = json_array_create_2(json_clone(schema),
                                                    json_clone(data));
        w->command = raft_command_execute(storage->raft, txn_json,
                                          prereq, &result);
        json_destroy(txn_json);
    } else if (storage->log) {
        w->error = ovsdb_storage_store_snapshot__(storage, schema, data);
    } else {
        /* When 'error' and 'command' are both null, it indicates that the
         * command is complete.  This is fine since this unbacked storage drops
         * writes. */
    }
    if (resultp) {
        *resultp = result;
    }
    return w;
}

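/* For clustered storage, returns the current entry ID ('eid') of the Raft
 * log, which callers can compare against a transaction's prerequisite before
 * proposing a commit.  Returns NULL for standalone and unbacked storage. */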
const struct uuid *
ovsdb_storage_peek_last_eid(struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return NULL;
    }
    return raft_current_eid(storage->raft);
}