2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-21 17:37:37 +00:00
ovs/ovsdb/storage.c
Ilya Maximets 1de4a08c22 json: Use functions to access json arrays.
Internal implementation of JSON array will be changed in the future
commits.  Add access functions that users can rely on instead of
accessing the internals of 'struct json' directly and convert all the
users.  Structure fields are intentionally renamed to make sure that
no code is using the old fields directly.

json_array() function is removed, as not needed anymore.  Added new
functions:  json_array_size(), json_array_at(), json_array_set()
and json_array_pop().  These are enough to cover all the use cases
within OVS.

The change is fairly large, however, IMO, it's a much overdue cleanup
that we need even without changing the underlying implementation.

Acked-by: Mike Pattrick <mkp@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-30 16:53:56 +02:00

676 lines
20 KiB
C

/* Copyright (c) 2009, 2010, 2011, 2016, 2017 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "storage.h"
#include <string.h>
#include "log.h"
#include "ovsdb-error.h"
#include "openvswitch/json.h"
#include "openvswitch/poll-loop.h"
#include "openvswitch/vlog.h"
#include "ovsdb.h"
#include "raft.h"
#include "random.h"
#include "simap.h"
#include "timeval.h"
#include "util.h"
VLOG_DEFINE_THIS_MODULE(storage);
/* State for one database's backing store. */
struct ovsdb_storage {
    /* Which of 'log' and 'raft' is set tells the three kinds of storage
     * apart:
     *
     *     Kind           'log'      'raft'     Backing
     *     -----------    -------    -------    ------------
     *     Standalone     nonnull    null       disk file
     *     Clustered      null       nonnull    Raft cluster
     *     Memory only    null       null       none
     */
    struct ovsdb_log *log;
    struct raft *raft;
    char *unbacked_name;         /* Name of the unbacked storage. */

    /* Used by every kind of storage.  The snapshot times are compared
     * against time_msec(). */
    struct ovsdb_error *error;   /* If nonnull, a permanent error. */
    long long next_snapshot_min; /* Earliest time to take next snapshot. */
    long long next_snapshot_max; /* Latest time to take next snapshot. */

    /* Used by standalone storage only. */
    unsigned int n_read;         /* Bumped for each record read from 'log'. */
    unsigned int n_written;      /* Bumped for each record written to 'log'. */
};
/* Forward declaration: the open and create functions below call this before
 * its definition later in the file. */
static void schedule_next_snapshot(struct ovsdb_storage *, bool quick);
/* Opens 'filename' and wraps it in a new storage object.  This is the shared
 * implementation behind ovsdb_storage_open() and
 * ovsdb_storage_open_standalone().
 *
 * The file is opened read/write if 'rw', otherwise read-only.  The magic
 * found in the file decides whether the result is standalone or clustered.
 * If the file is clustered and 'allow_clustered' is false, the open fails.
 *
 * On success, stores the new storage in '*storagep' and returns NULL.  On
 * failure, stores NULL in '*storagep' and returns the error. */
static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_open__(const char *filename, bool rw, bool allow_clustered,
                     struct ovsdb_storage **storagep)
{
    struct ovsdb_error *error;
    struct ovsdb_log *log;
    struct raft *raft = NULL;

    *storagep = NULL;

    error = ovsdb_log_open(filename, OVSDB_MAGIC"|"RAFT_MAGIC,
                           rw ? OVSDB_LOG_READ_WRITE : OVSDB_LOG_READ_ONLY,
                           -1, &log);
    if (error) {
        return error;
    }

    bool clustered = !strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC);
    if (clustered && !allow_clustered) {
        ovsdb_log_close(log);
        return ovsdb_error(NULL, "%s: cannot apply this operation to "
                           "clustered database file", filename);
    }

    if (clustered) {
        /* 'log' is dropped here without being closed, even when raft_open()
         * fails, so raft_open() presumably takes over the log in both
         * cases -- confirm against raft.h. */
        error = raft_open(log, &raft);
        log = NULL;
        if (error) {
            return error;
        }
    }

    struct ovsdb_storage *storage = xzalloc(sizeof *storage);
    storage->log = log;
    storage->raft = raft;
    schedule_next_snapshot(storage, false);

    *storagep = storage;
    return NULL;
}
/* Opens 'filename' for use as storage, read/write if 'rw' is true and
 * read-only otherwise.
 *
 * On success, stores the new storage in '*storagep' and returns NULL.  On
 * failure, stores NULL in '*storagep' and returns the error.
 *
 * Both standalone and clustered database files are accepted; which kind of
 * storage results depends on what the disk file contains. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_open(const char *filename, bool rw,
                   struct ovsdb_storage **storagep)
{
    const bool allow_clustered = true;

    return ovsdb_storage_open__(filename, rw, allow_clustered, storagep);
}
/* Opens 'filename' as standalone storage, read/write if 'rw' is true and
 * read-only otherwise, and returns the new storage.
 *
 * Clustered database files are not accepted.  Any failure to open is
 * reported through ovs_fatal(). */
struct ovsdb_storage *
ovsdb_storage_open_standalone(const char *filename, bool rw)
{
    const bool allow_clustered = false;
    struct ovsdb_storage *storage;

    struct ovsdb_error *error = ovsdb_storage_open__(filename, rw,
                                                     allow_clustered,
                                                     &storage);
    if (error != NULL) {
        ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
    }

    return storage;
}
/* Creates and returns new storage that has no backing at all: nothing will
 * ever be read from it and everything written to it is discarded.
 *
 * 'name' may be null.  If it is nonnull, a copy of it becomes the storage
 * name. */
struct ovsdb_storage *
ovsdb_storage_create_unbacked(const char *name)
{
    struct ovsdb_storage *unbacked = xzalloc(sizeof *unbacked);

    unbacked->unbacked_name = nullable_xstrdup(name);

    /* With neither a log nor a Raft instance, this pushes both snapshot
     * deadlines out to LLONG_MAX. */
    schedule_next_snapshot(unbacked, false);

    return unbacked;
}
/* Closes 'storage': closes its log and Raft instance, destroys any recorded
 * error, and frees its name and the storage object itself.  A null 'storage'
 * is accepted and ignored. */
void
ovsdb_storage_close(struct ovsdb_storage *storage)
{
    if (!storage) {
        return;
    }

    ovsdb_log_close(storage->log);
    raft_close(storage->raft);
    ovsdb_error_destroy(storage->error);
    free(storage->unbacked_name);
    free(storage);
}
/* Returns "clustered" if 'storage' has a Raft instance, otherwise
 * "standalone".  Unbacked storage therefore also reports "standalone". */
const char *
ovsdb_storage_get_model(const struct ovsdb_storage *storage)
{
    if (storage->raft) {
        return "clustered";
    }
    return "standalone";
}
/* Returns true if 'storage' is backed by a Raft instance, false otherwise. */
bool
ovsdb_storage_is_clustered(const struct ovsdb_storage *storage)
{
    return storage->raft ? true : false;
}
/* Returns whether 'storage' counts as connected.  Storage that is not
 * clustered is always connected; for clustered storage the answer comes from
 * raft_is_connected(). */
bool
ovsdb_storage_is_connected(const struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return true;
    }
    return raft_is_connected(storage->raft);
}
/* Returns true only if 'storage' is clustered and raft_left() reports true
 * for its Raft instance.  Storage that is not clustered is never dead. */
bool
ovsdb_storage_is_dead(const struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return false;
    }
    return raft_left(storage->raft);
}
/* Returns whether 'storage' counts as a leader.  Storage that is not
 * clustered is always a leader; for clustered storage the answer comes from
 * raft_is_leader(). */
bool
ovsdb_storage_is_leader(const struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return true;
    }
    return raft_is_leader(storage->raft);
}
/* Returns the result of raft_get_cid() (presumably the cluster ID) if
 * 'storage' is clustered, otherwise NULL. */
const struct uuid *
ovsdb_storage_get_cid(const struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return NULL;
    }
    return raft_get_cid(storage->raft);
}
/* Returns the result of raft_get_sid() (presumably this server's ID) if
 * 'storage' is clustered, otherwise NULL. */
const struct uuid *
ovsdb_storage_get_sid(const struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return NULL;
    }
    return raft_get_sid(storage->raft);
}
/* Returns the result of raft_get_applied_index() if 'storage' is clustered,
 * otherwise 0. */
uint64_t
ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return 0;
    }
    return raft_get_applied_index(storage->raft);
}
/* Passes 'usage' on to raft_get_memory_usage() if 'storage' is clustered.
 * Other kinds of storage leave 'usage' untouched. */
void
ovsdb_storage_get_memory_usage(const struct ovsdb_storage *storage,
                               struct simap *usage)
{
    if (!storage->raft) {
        return;
    }
    raft_get_memory_usage(storage->raft, usage);
}
char *
ovsdb_storage_get_error(const struct ovsdb_storage *storage)
{
if (storage->error) {
return ovsdb_error_to_string(storage->error);
}
return NULL;
}
/* Calls raft_run() if 'storage' is clustered.  Other kinds of storage need
 * no periodic processing here. */
void
ovsdb_storage_run(struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return;
    }
    raft_run(storage->raft);
}
/* Calls raft_wait() if 'storage' is clustered.  Other kinds of storage have
 * nothing to wait for here. */
void
ovsdb_storage_wait(struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return;
    }
    raft_wait(storage->raft);
}
/* Returns 'storage''s embedded name, if it has one, otherwise null.
 *
 * The name given to ovsdb_storage_create_unbacked(), if any, takes
 * precedence.  Otherwise clustered storage reports the name from
 * raft_get_name().  Standalone storage has no built-in name. */
const char *
ovsdb_storage_get_name(const struct ovsdb_storage *storage)
{
    if (storage->unbacked_name) {
        return storage->unbacked_name;
    }
    if (storage->raft) {
        return raft_get_name(storage->raft);
    }
    return NULL;
}
/* Attempts to read the next log record from 'storage'.
 *
 * If successful, returns NULL and stores the record in '*schemap' and
 * '*txnp', at least one of which will be nonnull.  The caller owns both and
 * must eventually free them (the schema with ovsdb_schema_destroy(), the
 * transaction with json_destroy()).
 *
 * If 'txnid' is nonnull, '*txnid' is first set to UUID_ZERO; for clustered
 * storage it is then handed to raft_next_entry() to fill in.  If 'storage'
 * is not clustered, 'txnid' may be null.
 *
 * If a read error occurs, returns the error and stores NULL in '*schemap'
 * and '*txnp'.
 *
 * If the read reaches end of file (or 'storage' is unbacked), returns NULL
 * and stores NULL in '*schemap' and '*txnp'. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_read(struct ovsdb_storage *storage,
                   struct ovsdb_schema **schemap,
                   struct json **txnp,
                   struct uuid *txnid)
{
    const struct json *schema_json = NULL;
    const struct json *txn_json = NULL;
    struct json *json;

    *schemap = NULL;
    *txnp = NULL;
    if (txnid) {
        *txnid = UUID_ZERO;
    }

    if (storage->raft) {
        json = raft_next_entry(storage->raft, txnid);
        if (!json) {
            return NULL;
        }
        if (json->type != JSON_ARRAY || json_array_size(json) != 2) {
            json_destroy(json);
            return ovsdb_error(NULL, "invalid commit format");
        }

        /* A clustered entry is a [schema, transaction] pair in which a JSON
         * null stands for "absent". */
        const struct json *first = json_array_at(json, 0);
        const struct json *second = json_array_at(json, 1);
        if (first->type != JSON_NULL) {
            schema_json = first;
        }
        if (second->type != JSON_NULL) {
            txn_json = second;
        }
    } else if (storage->log) {
        struct ovsdb_error *error = ovsdb_log_read(storage->log, &json);
        if (error || !json) {
            return error;
        }

        /* In a standalone log, the first record is the schema and every
         * later record is a transaction. */
        unsigned int n = storage->n_read++;
        if (n == 0) {
            schema_json = json;
        } else {
            txn_json = json;
        }
        if (n == 1) {
            ovsdb_log_mark_base(storage->log);
        }
    } else {
        /* Unbacked.  Nothing to do. */
        return NULL;
    }

    /* If we got this far then we must have at least a schema or a
     * transaction. */
    ovs_assert(schema_json || txn_json);

    struct ovsdb_error *error = NULL;
    if (schema_json) {
        struct ovsdb_schema *schema;

        error = ovsdb_schema_from_json(schema_json, &schema);
        if (!error) {
            const char *storage_name = ovsdb_storage_get_name(storage);

            if (storage_name && strcmp(storage_name, schema->name)) {
                error = ovsdb_error(NULL, "name %s in header does not match "
                                    "name %s in schema",
                                    storage_name, schema->name);
                ovsdb_schema_destroy(schema);
            } else {
                *schemap = schema;
            }
        }
    }
    if (!error && txn_json) {
        /* 'txn_json' points into 'json', which is destroyed below, so the
         * caller gets its own copy. */
        *txnp = json_clone(txn_json);
    }

    json_destroy(json);
    return error;
}
/* Reads and returns the schema from standalone storage 'storage'. Terminates
* with an error on failure. */
struct ovsdb_schema *
ovsdb_storage_read_schema(struct ovsdb_storage *storage)
{
ovs_assert(storage->log);
struct json *txn_json;
struct ovsdb_schema *schema;
struct ovsdb_error *error = ovsdb_storage_read(storage, &schema,
&txn_json, NULL);
if (error) {
ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
}
if (!schema && !txn_json) {
ovs_fatal(0, "unexpected end of file reading schema");
}
ovs_assert(schema && !txn_json);
return schema;
}
/* Returns the result of raft_has_next_entry() if 'storage' is clustered.
 * Always returns false for other kinds of storage. */
bool
ovsdb_storage_read_wait(struct ovsdb_storage *storage)
{
    if (!storage->raft) {
        return false;
    }
    return raft_has_next_entry(storage->raft);
}
/* Gives back the record most recently read from 'storage', as far as the
 * backing allows:
 *
 *   - Standalone storage passes the request on to ovsdb_log_unread().
 *
 *   - Clustered storage has no way to do this here, so the storage is put
 *     into a permanent "inconsistent data" error state instead.
 *
 *   - Unbacked storage, and storage that already has a permanent error, is
 *     left unchanged. */
void
ovsdb_storage_unread(struct ovsdb_storage *storage)
{
    if (storage->error) {
        return;
    }

    if (storage->raft) {
        /* 'storage->error' is known to be null here because of the early
         * return above, so no existing error can be overwritten. */
        storage->error = ovsdb_error(NULL, "inconsistent data");
    } else if (storage->log) {
        ovsdb_log_unread(storage->log);
    }
}
/* Tracks one write started on a storage.  When 'error' and 'command' are
 * both null, the write is already complete and succeeded. */
struct ovsdb_write {
    struct ovsdb_error *error;      /* If nonnull, why the write failed. */
    struct raft_command *command;   /* If nonnull, the Raft command that
                                     * carries a clustered write. */
};
/* Starts writing transaction 'data' to 'storage' and returns a handle for
 * tracking the write, which the caller must eventually release with
 * ovsdb_write_destroy().
 *
 *   - If 'storage' has a permanent error, the write fails with a copy of it.
 *
 *   - Clustered storage submits a [null, 'data'] entry to Raft, passing
 *     'prereq' along.
 *
 *   - Standalone storage appends 'data' to the log and, if 'durable', also
 *     calls ovsdb_log_commit_block().
 *
 *   - Unbacked storage drops the write.
 *
 * If 'resultp' is nonnull, '*resultp' receives the UUID filled in by
 * raft_command_execute() for a clustered write and UUID_ZERO in every other
 * case.
 *
 * Not suitable for writing transactions that change the schema. */
struct ovsdb_write * OVS_WARN_UNUSED_RESULT
ovsdb_storage_write(struct ovsdb_storage *storage, const struct json *data,
                    const struct uuid *prereq, struct uuid *resultp,
                    bool durable)
{
    struct uuid result = UUID_ZERO;
    struct ovsdb_write *w = xzalloc(sizeof *w);

    if (storage->error) {
        w->error = ovsdb_error_clone(storage->error);
    } else if (storage->raft) {
        /* The schema slot of the pair is null: this entry carries only a
         * transaction. */
        struct json *entry = json_array_create_2(json_null_create(),
                                                 json_clone(data));

        w->command = raft_command_execute(storage->raft, entry,
                                          prereq, &result);
        json_destroy(entry);
    } else if (storage->log) {
        struct ovsdb_error *error = ovsdb_log_write(storage->log, data);

        if (!error) {
            storage->n_written++;
            if (durable) {
                error = ovsdb_log_commit_block(storage->log);
            }
        }
        w->error = error;
    } else {
        /* When 'error' and 'command' are both null, it indicates that the
         * command is complete.  This is fine since this unbacked storage
         * drops writes. */
    }

    if (resultp) {
        *resultp = result;
    }
    return w;
}
/* Blocking variant of ovsdb_storage_write(): starts the write with the same
 * arguments, then runs Raft (for clustered storage) and blocks in
 * poll_block() until the write is complete.
 *
 * Returns NULL if the write succeeded, otherwise an error that the caller
 * owns.
 *
 * Not suitable for writing transactions that change the schema. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_write_block(struct ovsdb_storage *storage,
                          const struct json *data, const struct uuid *prereq,
                          struct uuid *resultp, bool durable)
{
    struct ovsdb_write *w = ovsdb_storage_write(storage, data,
                                                prereq, resultp, durable);

    for (;;) {
        if (ovsdb_write_is_complete(w)) {
            break;
        }

        if (storage->raft) {
            raft_run(storage->raft);
        }

        ovsdb_write_wait(w);
        if (storage->raft) {
            raft_wait(storage->raft);
        }
        poll_block();
    }

    /* The error is cloned because the one inside 'w' is destroyed together
     * with 'w'. */
    struct ovsdb_error *error = ovsdb_error_clone(ovsdb_write_get_error(w));
    ovsdb_write_destroy(w);
    return error;
}
bool
ovsdb_write_is_complete(const struct ovsdb_write *w)
{
return (w->error
|| !w->command
|| raft_command_get_status(w->command) != RAFT_CMD_INCOMPLETE);
}
const struct ovsdb_error *
ovsdb_write_get_error(const struct ovsdb_write *w_)
{
struct ovsdb_write *w = CONST_CAST(struct ovsdb_write *, w_);
ovs_assert(ovsdb_write_is_complete(w));
if (w->command && !w->error) {
enum raft_command_status status = raft_command_get_status(w->command);
if (status != RAFT_CMD_SUCCESS) {
w->error = ovsdb_error("cluster error", "%s",
raft_command_status_to_string(status));
}
}
return w->error;
}
uint64_t
ovsdb_write_get_commit_index(const struct ovsdb_write *w)
{
ovs_assert(ovsdb_write_is_complete(w));
return (w->command && !w->error
? raft_command_get_commit_index(w->command)
: 0);
}
/* If write 'w' is already complete, calls poll_immediate_wake() (presumably
 * so that the next poll_block() returns at once).  Does nothing for a write
 * that is still in progress. */
void
ovsdb_write_wait(const struct ovsdb_write *w)
{
    if (!ovsdb_write_is_complete(w)) {
        return;
    }
    poll_immediate_wake();
}
/* Releases write 'w': drops its reference to the Raft command, destroys its
 * error, and frees the handle.  A null 'w' is accepted and ignored. */
void
ovsdb_write_destroy(struct ovsdb_write *w)
{
    if (!w) {
        return;
    }

    raft_command_unref(w->command);
    ovsdb_error_destroy(w->error);
    free(w);
}
/* Chooses the time window for the next snapshot of 'storage'.
 *
 * For backed storage, the earliest time becomes now plus a base delay plus
 * a random_range() offset bounded by a range.  Base and range are each 10
 * minutes, or 1 minute if 'quick'.  The latest time becomes one day from
 * now, but a 'quick' reschedule leaves it as it was.
 *
 * Unbacked storage gets LLONG_MAX for both times. */
static void
schedule_next_snapshot(struct ovsdb_storage *storage, bool quick)
{
    if (!storage->log && !storage->raft) {
        storage->next_snapshot_min = LLONG_MAX;
        storage->next_snapshot_max = LLONG_MAX;
        return;
    }

    const unsigned int ten_minutes = 10 * 60 * 1000;    /* In ms. */
    unsigned int base = quick ? ten_minutes / 10 : ten_minutes;
    unsigned int range = quick ? ten_minutes / 10 : ten_minutes;

    long long int now = time_msec();
    storage->next_snapshot_min = now + base + random_range(range);

    if (!quick) {
        storage->next_snapshot_max = now + 60LL * 60 * 24 * 1000;
    }
}
/* Returns true if the caller should take a snapshot of 'storage' now.
 *
 * Unbacked storage never snapshots.  Otherwise nothing happens before the
 * earliest snapshot time.  Between the earliest and latest times, a snapshot
 * is recommended once the log has at least 100 entries and has grown a lot;
 * after the latest time, any log entry at all is enough.
 *
 * Side effects, clustered storage only: a check that does not recommend a
 * snapshot reschedules the next check with a short random delay, and a
 * recommended snapshot that Raft cannot take right now is reported to Raft
 * with raft_notify_snapshot_recommended(). */
bool
ovsdb_storage_should_snapshot(struct ovsdb_storage *storage)
{
    if (!storage->raft && !storage->log) {
        return false;
    }

    /* If we haven't reached the minimum snapshot time, don't snapshot. */
    long long int now = time_msec();
    if (now < storage->next_snapshot_min) {
        return false;
    }

    uint64_t n_entries = (storage->raft
                          ? raft_get_log_length(storage->raft)
                          : storage->n_read + storage->n_written);

    bool recommended;
    if (now < storage->next_snapshot_max) {
        /* Maximum snapshot time not yet reached.  Take a snapshot if there
         * have been at least 100 log entries and the log file size has
         * grown a lot. */
        bool grew_lots = (storage->raft
                          ? raft_grew_lots(storage->raft)
                          : ovsdb_log_grew_lots(storage->log));

        recommended = n_entries >= 100 && grew_lots;
    } else {
        /* We have reached the maximum snapshot time.  Take a snapshot if
         * there have been any log entries at all. */
        recommended = n_entries > 0;
    }

    if (!recommended) {
        if (storage->raft) {
            /* Re-schedule with a quick retry.  Otherwise all the Raft
             * servers that are already past the minimal time, but whose log
             * didn't grow a lot, would check on every iteration and then all
             * start snapshotting at the same moment once the log reaches the
             * critical size.  The quick retry randomizes the time of the
             * next attempt. */
            schedule_next_snapshot(storage, true);
        }
        return false;
    }

    /* If we can't snapshot right now, don't. */
    if (storage->raft && !raft_may_snapshot(storage->raft)) {
        /* Notify the storage that it needs to make a snapshot soon. */
        raft_notify_snapshot_recommended(storage->raft);
        return false;
    }

    return true;
}
/* Replaces the contents of 'storage' with a snapshot made of 'schema' and
 * 'data', either of which may be null to leave it out.
 *
 * Clustered storage hands a JSON array of copies of the entries, together
 * with 'index', to raft_store_snapshot().  Standalone storage replaces its
 * log with the entries through ovsdb_log_replace() and ignores 'index'.
 * Unbacked storage does nothing.
 *
 * Returns NULL if successful, otherwise the error.  Does not reschedule the
 * next snapshot; see ovsdb_storage_store_snapshot() for that. */
static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_store_snapshot__(struct ovsdb_storage *storage,
                               const struct json *schema,
                               const struct json *data, uint64_t index)
{
    if (storage->raft) {
        struct json *snapshot = json_array_create_empty();

        if (schema) {
            json_array_add(snapshot, json_clone(schema));
        }
        if (data) {
            json_array_add(snapshot, json_clone(data));
        }

        struct ovsdb_error *error = raft_store_snapshot(storage->raft,
                                                        snapshot, index);
        json_destroy(snapshot);
        return error;
    }

    if (!storage->log) {
        /* Unbacked: there is nowhere to store a snapshot. */
        return NULL;
    }

    /* ovsdb_log_replace() takes non-const pointers, hence the casts.  The
     * entries stay owned by the caller. */
    struct json *records[2];
    size_t n_records = 0;

    if (schema) {
        records[n_records++] = CONST_CAST(struct json *, schema);
    }
    if (data) {
        records[n_records++] = CONST_CAST(struct json *, data);
    }
    return ovsdb_log_replace(storage->log, records, n_records);
}
/* Stores a snapshot of 'schema' and 'data' in 'storage', then schedules the
 * next snapshot: on the normal timetable if this one succeeded, or with a
 * quick retry if it failed.  Returns NULL on success, otherwise the error.
 *
 * 'schema' and 'data' should faithfully represent the current schema and
 * data, otherwise the two backing formats (standalone and clustered) will
 * yield divergent results.  Use ovsdb_storage_write_schema_change() to
 * change the schema. */
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_storage_store_snapshot(struct ovsdb_storage *storage,
                             const struct json *schema,
                             const struct json *data, uint64_t index)
{
    struct ovsdb_error *error;

    error = ovsdb_storage_store_snapshot__(storage, schema, data, index);
    schedule_next_snapshot(storage, error != NULL);
    return error;
}
/* Starts writing a transaction that replaces the schema of 'storage' with
 * 'schema' and its contents with 'data'.  Returns a handle for tracking the
 * write, which the caller must eventually release with
 * ovsdb_write_destroy().
 *
 *   - If 'storage' has a permanent error, the write fails with a copy of it.
 *
 *   - Clustered storage rejects schemas with ephemeral columns.  Otherwise
 *     it submits a ['schema', 'data'] entry to Raft, passing 'prereq' along.
 *
 *   - Standalone storage replaces its log with a snapshot of 'schema' and
 *     'data'.
 *
 *   - Unbacked storage drops the write.
 *
 * If 'resultp' is nonnull, '*resultp' receives the UUID filled in by
 * raft_command_execute() for a clustered write and UUID_ZERO in every other
 * case. */
struct ovsdb_write * OVS_WARN_UNUSED_RESULT
ovsdb_storage_write_schema_change(struct ovsdb_storage *storage,
                                  const struct ovsdb_schema *schema,
                                  const struct json *data,
                                  const struct uuid *prereq,
                                  struct uuid *resultp)
{
    struct uuid result = UUID_ZERO;
    struct ovsdb_write *w = xzalloc(sizeof *w);

    if (storage->error) {
        w->error = ovsdb_error_clone(storage->error);
    } else if (storage->raft) {
        /* Clustered storage doesn't support ephemeral columns. */
        w->error = ovsdb_schema_check_for_ephemeral_columns(schema);
        if (!w->error) {
            struct json *entry;

            entry = json_array_create_2(ovsdb_schema_to_json(schema),
                                        json_clone(data));
            w->command = raft_command_execute(storage->raft, entry,
                                              prereq, &result);
            json_destroy(entry);
        }
    } else if (storage->log) {
        struct json *schema_json = ovsdb_schema_to_json(schema);

        w->error = ovsdb_storage_store_snapshot__(storage, schema_json,
                                                  data, 0);
        json_destroy(schema_json);
    } else {
        /* When 'error' and 'command' are both null, it indicates that the
         * command is complete.  This is fine since this unbacked storage
         * drops writes. */
    }

    if (resultp) {
        *resultp = result;
    }
    return w;
}
/* Returns the result of raft_precheck_prereq() for 'prereq' if 'storage' is
 * clustered.  Storage that is not clustered has no prerequisites to check,
 * so the answer is always true. */
bool
ovsdb_storage_precheck_prereq(const struct ovsdb_storage *storage,
                              const struct uuid *prereq)
{
    return !storage->raft || raft_precheck_prereq(storage->raft, prereq);
}