2017-09-12 16:28:28 -07:00
|
|
|
|
/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2016, 2017 Nicira, Inc.
|
2009-11-13 13:37:55 -08:00
|
|
|
|
*
|
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
|
* You may obtain a copy of the License at:
|
|
|
|
|
*
|
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
*
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
|
* limitations under the License.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
|
|
|
|
|
#include "file.h"
|
|
|
|
|
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include <errno.h>
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include <fcntl.h>
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include <unistd.h>
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2010-03-11 17:14:31 -08:00
|
|
|
|
#include "bitmap.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "column.h"
|
|
|
|
|
#include "log.h"
|
2016-07-12 16:37:34 -05:00
|
|
|
|
#include "openvswitch/json.h"
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include "lockfile.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "ovsdb.h"
|
|
|
|
|
#include "ovsdb-error.h"
|
|
|
|
|
#include "row.h"
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include "socket-util.h"
|
2017-12-31 21:15:58 -08:00
|
|
|
|
#include "storage.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "table.h"
|
2009-12-16 13:30:53 -08:00
|
|
|
|
#include "timeval.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "transaction.h"
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
#include "unixctl.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "uuid.h"
|
|
|
|
|
#include "util.h"
|
2014-12-15 14:10:38 +01:00
|
|
|
|
#include "openvswitch/vlog.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2010-10-19 14:47:01 -07:00
|
|
|
|
VLOG_DEFINE_THIS_MODULE(ovsdb_file);
|
2010-07-16 11:02:49 -07:00
|
|
|
|
|
2010-02-10 16:35:24 -08:00
|
|
|
|
/* A transaction being converted to JSON for writing to a file.
 *
 * 'json' accumulates the whole transaction object; 'table_json' is the
 * sub-object for the table currently being emitted, and 'table' identifies
 * which table that is (so consecutive rows of the same table share one
 * sub-object). */
struct ovsdb_file_txn {
    struct json *json;          /* JSON for the whole transaction. */
    struct json *table_json;    /* JSON for 'table''s transaction. */
    struct ovsdb_table *table;  /* Table described in 'table_json'. */
};
|
|
|
|
|
|
|
|
|
|
static void ovsdb_file_txn_init(struct ovsdb_file_txn *);
/* Adds the 'old' -> 'new' change for the columns marked in 'changed' to a
 * file transaction.  When 'allow_shallow_copies' is false, datum-to-JSON
 * conversion must avoid shallow json_clone() copies so that the call is safe
 * from the compaction thread (see the reference-counter race note above). */
static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *,
                                   const struct ovsdb_row *old,
                                   const struct ovsdb_row *new,
                                   const unsigned long int *changed,
                                   bool allow_shallow_copies);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
/* If set to 'true', file transactions will contain difference between
 * datums of old and new rows and not the whole new datum for the column.
 *
 * Enabled by default; can be turned off for cluster upgrades, since older
 * ovsdb-servers cannot read databases written with column diffs. */
static bool use_column_diff = true;
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
ovsdb_file_column_diff_enable(struct unixctl_conn *conn, int argc OVS_UNUSED,
|
|
|
|
|
const char *argv[] OVS_UNUSED,
|
|
|
|
|
void *arg OVS_UNUSED)
|
|
|
|
|
{
|
|
|
|
|
use_column_diff = true;
|
|
|
|
|
unixctl_command_reply(conn, NULL);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
ovsdb_file_column_diff_disable(void)
|
|
|
|
|
{
|
|
|
|
|
if (!use_column_diff) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
use_column_diff = false;
|
|
|
|
|
unixctl_command_register("ovsdb/file/column-diff-enable", "",
|
|
|
|
|
0, 0, ovsdb_file_column_diff_enable, NULL);
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-12 11:26:54 -08:00
|
|
|
|
/* Updates 'row' in place from the JSON object 'json', which maps column
 * names to column values (or, if 'row_contains_diff' is true, to diffs
 * against the row's current values).
 *
 * If 'converting' is true, unknown column names are silently skipped
 * (the schema is being converted and may have dropped columns); otherwise
 * an unknown column is a syntax error.
 *
 * Returns NULL on success, otherwise an ovsdb_error that the caller owns. */
static struct ovsdb_error *
ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting,
                                bool row_contains_diff,
                                const struct json *json)
{
    struct ovsdb_table_schema *schema = row->table->schema;
    struct ovsdb_error *error;
    struct shash_node *node;

    if (json->type != JSON_OBJECT) {
        return ovsdb_syntax_error(json, NULL, "row must be JSON object");
    }

    SHASH_FOR_EACH (node, json_object(json)) {
        const char *column_name = node->name;
        const struct ovsdb_column *column;
        struct ovsdb_datum datum;

        column = ovsdb_table_schema_get_column(schema, column_name);
        if (!column) {
            if (converting) {
                /* Schema conversion may legitimately drop columns. */
                continue;
            }
            return ovsdb_syntax_error(json, "unknown column",
                                      "No column %s in table %s.",
                                      column_name, schema->name);
        }

        if (row_contains_diff) {
            /* Diff may violate the type size rules (e.g. a diff for a
             * single-element column can hold both old and new values), so
             * parse it without size constraints. */
            error = ovsdb_transient_datum_from_json(&datum, &column->type,
                                                    node->data);
        } else {
            error = ovsdb_datum_from_json(&datum, &column->type,
                                          node->data, NULL);
        }
        if (error) {
            return error;
        }

        if (row_contains_diff
            && !ovsdb_datum_is_default(&row->fields[column->index],
                                       &column->type)) {
            /* Apply the diff on top of the row's existing value.  'datum'
             * is consumed here and must be destroyed on both paths. */
            error = ovsdb_datum_apply_diff_in_place(
                        &row->fields[column->index],
                        &datum, &column->type);
            ovsdb_datum_destroy(&datum, &column->type);
            if (error) {
                return error;
            }
        } else {
            /* Whole-value replacement: take ownership of the parsed datum
             * by swapping, then destroy the row's old value (now in
             * 'datum').  A diff against a default (empty) value lands here
             * too, since applying a diff to an empty datum is the same as
             * replacement. */
            ovsdb_datum_swap(&row->fields[column->index], &datum);
            ovsdb_datum_destroy(&datum, &column->type);
        }
    }

    return NULL;
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
static struct ovsdb_error *
|
|
|
|
|
ovsdb_file_txn_row_from_json(struct ovsdb_txn *txn, struct ovsdb_table *table,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
bool converting, bool row_contains_diff,
|
2009-11-13 13:37:55 -08:00
|
|
|
|
const struct uuid *row_uuid, struct json *json)
|
|
|
|
|
{
|
|
|
|
|
const struct ovsdb_row *row = ovsdb_table_get_row(table, row_uuid);
|
|
|
|
|
if (json->type == JSON_NULL) {
|
|
|
|
|
if (!row) {
|
|
|
|
|
return ovsdb_syntax_error(NULL, NULL, "transaction deletes "
|
|
|
|
|
"row "UUID_FMT" that does not exist",
|
|
|
|
|
UUID_ARGS(row_uuid));
|
|
|
|
|
}
|
|
|
|
|
ovsdb_txn_row_delete(txn, row);
|
|
|
|
|
return NULL;
|
|
|
|
|
} else if (row) {
|
2010-02-12 11:26:54 -08:00
|
|
|
|
return ovsdb_file_update_row_from_json(ovsdb_txn_row_modify(txn, row),
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
converting, row_contains_diff,
|
|
|
|
|
json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
} else {
|
|
|
|
|
struct ovsdb_error *error;
|
|
|
|
|
struct ovsdb_row *new;
|
|
|
|
|
|
|
|
|
|
new = ovsdb_row_create(table);
|
|
|
|
|
*ovsdb_row_get_uuid_rw(new) = *row_uuid;
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
error = ovsdb_file_update_row_from_json(new, converting,
|
|
|
|
|
row_contains_diff, json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (error) {
|
|
|
|
|
ovsdb_row_destroy(new);
|
2011-07-06 14:22:42 -07:00
|
|
|
|
} else {
|
|
|
|
|
ovsdb_txn_row_insert(txn, new);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
return error;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static struct ovsdb_error *
|
|
|
|
|
ovsdb_file_txn_table_from_json(struct ovsdb_txn *txn,
|
2010-02-12 11:26:54 -08:00
|
|
|
|
struct ovsdb_table *table,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
bool converting,
|
|
|
|
|
bool row_contains_diff,
|
|
|
|
|
struct json *json)
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
|
|
|
|
|
if (json->type != JSON_OBJECT) {
|
|
|
|
|
return ovsdb_syntax_error(json, NULL, "object expected");
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-24 10:32:59 -07:00
|
|
|
|
SHASH_FOR_EACH (node, json->object) {
|
2009-11-13 13:37:55 -08:00
|
|
|
|
const char *uuid_string = node->name;
|
|
|
|
|
struct json *txn_row_json = node->data;
|
|
|
|
|
struct ovsdb_error *error;
|
|
|
|
|
struct uuid row_uuid;
|
|
|
|
|
|
|
|
|
|
if (!uuid_from_string(&row_uuid, uuid_string)) {
|
|
|
|
|
return ovsdb_syntax_error(json, NULL, "\"%s\" is not a valid UUID",
|
|
|
|
|
uuid_string);
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-12 11:26:54 -08:00
|
|
|
|
error = ovsdb_file_txn_row_from_json(txn, table, converting,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
row_contains_diff,
|
2010-02-12 11:26:54 -08:00
|
|
|
|
&row_uuid, txn_row_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (error) {
|
|
|
|
|
return error;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2010-03-18 11:24:55 -07:00
|
|
|
|
/* Converts 'json' to an ovsdb_txn for 'db', storing the new transaction in
|
|
|
|
|
* '*txnp'. Returns NULL if successful, otherwise an error.
|
|
|
|
|
*
|
|
|
|
|
* If 'converting' is true, then unknown table and column names are ignored
|
|
|
|
|
* (which can ease upgrading and downgrading schemas); otherwise, they are
|
2013-09-13 18:52:53 -07:00
|
|
|
|
* treated as errors. */
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb_error *
|
2009-11-13 13:37:55 -08:00
|
|
|
|
ovsdb_file_txn_from_json(struct ovsdb *db, const struct json *json,
|
2013-09-13 18:52:53 -07:00
|
|
|
|
bool converting, struct ovsdb_txn **txnp)
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
|
|
|
|
struct ovsdb_error *error;
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
struct ovsdb_txn *txn;
|
|
|
|
|
|
|
|
|
|
*txnp = NULL;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (json->type != JSON_OBJECT) {
|
|
|
|
|
return ovsdb_syntax_error(json, NULL, "object expected");
|
|
|
|
|
}
|
|
|
|
|
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
struct json *is_diff = shash_find_data(json->object, "_is_diff");
|
|
|
|
|
bool row_contains_diff = false;
|
|
|
|
|
|
|
|
|
|
if (is_diff && is_diff->type == JSON_TRUE) {
|
|
|
|
|
row_contains_diff = true;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
txn = ovsdb_txn_create(db);
|
2018-05-24 10:32:59 -07:00
|
|
|
|
SHASH_FOR_EACH (node, json->object) {
|
2009-11-13 13:37:55 -08:00
|
|
|
|
const char *table_name = node->name;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
struct json *node_json = node->data;
|
2009-11-13 13:37:55 -08:00
|
|
|
|
struct ovsdb_table *table;
|
|
|
|
|
|
|
|
|
|
table = shash_find_data(&db->tables, table_name);
|
|
|
|
|
if (!table) {
|
2009-12-16 13:30:53 -08:00
|
|
|
|
if (!strcmp(table_name, "_date")
|
2010-03-18 11:24:55 -07:00
|
|
|
|
&& node_json->type == JSON_INTEGER) {
|
|
|
|
|
continue;
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
} else if (!strcmp(table_name, "_is_diff")
|
|
|
|
|
&& (node_json->type == JSON_TRUE
|
|
|
|
|
|| node_json->type == JSON_FALSE)) {
|
|
|
|
|
continue;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
} else if (!strcmp(table_name, "_comment") || converting) {
|
2009-12-16 13:30:53 -08:00
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
error = ovsdb_syntax_error(json, "unknown table",
|
|
|
|
|
"No table named %s.", table_name);
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-12 11:26:54 -08:00
|
|
|
|
error = ovsdb_file_txn_table_from_json(txn, table, converting,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
row_contains_diff, node_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (error) {
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
*txnp = txn;
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
error:
|
|
|
|
|
ovsdb_txn_abort(txn);
|
|
|
|
|
return error;
|
|
|
|
|
}
|
2010-02-12 11:26:54 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Copies every row of 'src_table' into 'dst_table' as part of 'txn',
 * converting column data to the destination column types where needed.
 * Source columns with no same-named destination column are dropped.
 * Returns NULL if successful, otherwise an error; on error the partially
 * converted destination row has been destroyed. */
static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_convert_table(struct ovsdb_txn *txn,
                    const struct ovsdb_table *src_table,
                    struct ovsdb_table *dst_table)
{
    const struct ovsdb_column **dst_columns;
    struct ovsdb_error *error = NULL;
    const struct ovsdb_row *src_row;
    unsigned long *src_equal;
    struct shash_node *node;
    size_t n_src_columns;

    /* Precompute, per source column index, the matching destination column
     * (NULL entry if none) and a bitmap of columns whose types are equal,
     * so the per-row loop below avoids repeated hash lookups and type
     * comparisons. */
    n_src_columns = shash_count(&src_table->schema->columns);
    src_equal = bitmap_allocate(n_src_columns);
    dst_columns = xzalloc(n_src_columns * sizeof *dst_columns);

    SHASH_FOR_EACH (node, &src_table->schema->columns) {
        const struct ovsdb_column *src_column = node->data;

        /* _uuid and _version are set explicitly for each new row below. */
        if (src_column->index == OVSDB_COL_UUID ||
            src_column->index == OVSDB_COL_VERSION) {
            continue;
        }

        const struct ovsdb_column *dst_column =
            shash_find_data(&dst_table->schema->columns, src_column->name);

        if (!dst_column) {
            continue;
        }

        dst_columns[src_column->index] = dst_column;

        if (ovsdb_type_equals(&src_column->type, &dst_column->type)) {
            bitmap_set1(src_equal, src_column->index);
        }
    }

    HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) {
        struct ovsdb_row *dst_row = ovsdb_row_create(dst_table);
        *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row);

        SHASH_FOR_EACH (node, &src_table->schema->columns) {
            const struct ovsdb_column *src_column = node->data;
            const struct ovsdb_column *dst_column;

            dst_column = dst_columns[src_column->index];
            if (!dst_column) {
                continue;
            }

            /* Discard the default datum created by ovsdb_row_create()
             * before storing the converted (or cloned) source value. */
            ovsdb_datum_destroy(&dst_row->fields[dst_column->index],
                                &dst_column->type);

            if (bitmap_is_set(src_equal, src_column->index)) {
                /* This column didn't change - no need to convert. */
                ovsdb_datum_clone(&dst_row->fields[dst_column->index],
                                  &src_row->fields[src_column->index]);
                continue;
            }

            error = ovsdb_datum_convert(
                &dst_row->fields[dst_column->index], &dst_column->type,
                &src_row->fields[src_column->index], &src_column->type);
            if (error) {
                /* The datum was destroyed above; re-initialize it to empty
                 * so that ovsdb_row_destroy() does not free it again. */
                ovsdb_datum_init_empty(&dst_row->fields[dst_column->index]);
                ovsdb_row_destroy(dst_row);
                goto exit;
            }
        }

        ovsdb_txn_row_insert(txn, dst_row);
    }

exit:
    free(dst_columns);
    bitmap_free(src_equal);
    return error;
}
|
2010-03-18 11:24:55 -07:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Copies the data in 'src', converts it into the schema specified in
|
|
|
|
|
* 'new_schema', and puts it into a newly created, unbacked database, and
|
|
|
|
|
* stores a pointer to the new database in '*dstp'. Returns null if
|
|
|
|
|
* successful, otherwise an error; on error, stores NULL in '*dstp'. */
|
|
|
|
|
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
|
|
|
|
|
ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema,
|
|
|
|
|
struct ovsdb **dstp)
|
2010-03-18 11:24:55 -07:00
|
|
|
|
{
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb *dst = ovsdb_create(ovsdb_schema_clone(new_schema),
|
2021-06-01 21:52:08 +02:00
|
|
|
|
ovsdb_storage_create_unbacked(NULL));
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb_txn *txn = ovsdb_txn_create(dst);
|
|
|
|
|
struct ovsdb_error *error = NULL;
|
2011-02-08 15:36:21 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct shash_node *node;
|
|
|
|
|
SHASH_FOR_EACH (node, &src->tables) {
|
|
|
|
|
const char *table_name = node->name;
|
|
|
|
|
struct ovsdb_table *src_table = node->data;
|
|
|
|
|
struct ovsdb_table *dst_table = shash_find_data(&dst->tables,
|
|
|
|
|
table_name);
|
|
|
|
|
if (!dst_table) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
error = ovsdb_convert_table(txn, src_table, dst_table);
|
|
|
|
|
if (error) {
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
2010-03-18 11:24:55 -07:00
|
|
|
|
}
|
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
error = ovsdb_txn_replay_commit(txn);
|
|
|
|
|
if (error) {
|
|
|
|
|
txn = NULL; /* ovsdb_txn_replay_commit() already aborted. */
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
2010-03-18 11:24:55 -07:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
*dstp = dst;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
return NULL;
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
error:
|
|
|
|
|
ovsdb_destroy(dst);
|
|
|
|
|
if (txn) {
|
|
|
|
|
ovsdb_txn_abort(txn);
|
|
|
|
|
}
|
|
|
|
|
*dstp = NULL;
|
|
|
|
|
return error;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
static bool
|
2010-03-16 15:13:42 -07:00
|
|
|
|
ovsdb_file_change_cb(const struct ovsdb_row *old,
|
|
|
|
|
const struct ovsdb_row *new,
|
|
|
|
|
const unsigned long int *changed,
|
|
|
|
|
void *ftxn_)
|
2010-02-10 16:35:24 -08:00
|
|
|
|
{
|
|
|
|
|
struct ovsdb_file_txn *ftxn = ftxn_;
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
ovsdb_file_txn_add_row(ftxn, old, new, changed, true);
|
2010-02-10 16:35:24 -08:00
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
/* Converts the database into transaction JSON representation.
|
|
|
|
|
* If 'allow_shallow_copies' is false, makes sure that all the JSON
|
|
|
|
|
* objects in the resulted transaction JSON are separately allocated
|
|
|
|
|
* objects and not shallow clones of JSON objects already existing
|
|
|
|
|
* in the database. Useful when multiple threads are working on the
|
|
|
|
|
* same database object. */
|
2017-12-15 08:35:41 -08:00
|
|
|
|
struct json *
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
ovsdb_to_txn_json(const struct ovsdb *db, const char *comment,
|
|
|
|
|
bool allow_shallow_copies)
|
2017-12-15 08:35:41 -08:00
|
|
|
|
{
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb_file_txn ftxn;
|
|
|
|
|
|
|
|
|
|
ovsdb_file_txn_init(&ftxn);
|
|
|
|
|
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
SHASH_FOR_EACH (node, &db->tables) {
|
|
|
|
|
const struct ovsdb_table *table = node->data;
|
|
|
|
|
const struct ovsdb_row *row;
|
|
|
|
|
|
|
|
|
|
HMAP_FOR_EACH (row, hmap_node, &table->rows) {
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL,
|
|
|
|
|
allow_shallow_copies);
|
2017-12-31 21:15:58 -08:00
|
|
|
|
}
|
2017-12-15 08:35:41 -08:00
|
|
|
|
}
|
2017-12-31 21:15:58 -08:00
|
|
|
|
|
|
|
|
|
return ovsdb_file_txn_annotate(ftxn.json, comment);
|
2017-12-15 08:35:41 -08:00
|
|
|
|
}
|
|
|
|
|
|
2017-12-28 13:21:11 -08:00
|
|
|
|
/* Returns 'txn' transformed into the JSON format that is used in OVSDB files.
|
|
|
|
|
* (But the caller must use ovsdb_file_txn_annotate() to add the _comment and
|
|
|
|
|
* _date members.) If 'txn' doesn't actually change anything, returns NULL */
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct json *
|
2017-12-28 13:21:11 -08:00
|
|
|
|
ovsdb_file_txn_to_json(const struct ovsdb_txn *txn)
|
2010-02-10 16:35:24 -08:00
|
|
|
|
{
|
|
|
|
|
struct ovsdb_file_txn ftxn;
|
|
|
|
|
|
|
|
|
|
ovsdb_file_txn_init(&ftxn);
|
2010-03-16 15:13:42 -07:00
|
|
|
|
ovsdb_txn_for_each_change(txn, ovsdb_file_change_cb, &ftxn);
|
2017-12-28 13:21:11 -08:00
|
|
|
|
return ftxn.json;
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct json *
|
|
|
|
|
ovsdb_file_txn_annotate(struct json *json, const char *comment)
|
2010-03-18 11:24:55 -07:00
|
|
|
|
{
|
2017-12-31 21:15:58 -08:00
|
|
|
|
if (!json) {
|
|
|
|
|
json = json_object_create();
|
2010-03-18 11:24:55 -07:00
|
|
|
|
}
|
2017-12-31 21:15:58 -08:00
|
|
|
|
if (comment) {
|
|
|
|
|
json_object_put_string(json, "_comment", comment);
|
2017-09-12 16:28:28 -07:00
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
if (use_column_diff) {
|
|
|
|
|
json_object_put(json, "_is_diff", json_boolean_create(true));
|
|
|
|
|
}
|
2017-12-31 21:15:58 -08:00
|
|
|
|
json_object_put(json, "_date", json_integer_create(time_wall_msec()));
|
|
|
|
|
return json;
|
2010-02-10 16:35:24 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
ovsdb_file_txn_init(struct ovsdb_file_txn *ftxn)
|
|
|
|
|
{
|
|
|
|
|
ftxn->json = NULL;
|
|
|
|
|
ftxn->table_json = NULL;
|
|
|
|
|
ftxn->table = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn,
|
|
|
|
|
const struct ovsdb_row *old,
|
2010-03-11 17:14:31 -08:00
|
|
|
|
const struct ovsdb_row *new,
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
const unsigned long int *changed,
|
|
|
|
|
bool allow_shallow_copies)
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
|
|
|
|
struct json *row;
|
|
|
|
|
|
|
|
|
|
if (!new) {
|
|
|
|
|
row = json_null_create();
|
|
|
|
|
} else {
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
|
2010-02-12 11:13:24 -08:00
|
|
|
|
row = old ? NULL : json_object_create();
|
2009-11-13 13:37:55 -08:00
|
|
|
|
SHASH_FOR_EACH (node, &new->table->schema->columns) {
|
|
|
|
|
const struct ovsdb_column *column = node->data;
|
|
|
|
|
const struct ovsdb_type *type = &column->type;
|
|
|
|
|
unsigned int idx = column->index;
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
struct ovsdb_datum datum;
|
|
|
|
|
struct json *column_json;
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
|
|
|
|
if (idx != OVSDB_COL_UUID && column->persistent
|
2010-01-11 13:14:54 -08:00
|
|
|
|
&& (old
|
2010-03-11 17:14:31 -08:00
|
|
|
|
? bitmap_is_set(changed, idx)
|
2010-01-11 13:14:54 -08:00
|
|
|
|
: !ovsdb_datum_is_default(&new->fields[idx], type)))
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
if (old && use_column_diff) {
|
|
|
|
|
ovsdb_datum_diff(&datum, &old->fields[idx],
|
|
|
|
|
&new->fields[idx], type);
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
if (allow_shallow_copies) {
|
|
|
|
|
column_json = ovsdb_datum_to_json(&datum, type);
|
|
|
|
|
} else {
|
|
|
|
|
column_json = ovsdb_datum_to_json_deep(&datum, type);
|
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
ovsdb_datum_destroy(&datum, type);
|
|
|
|
|
} else {
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
if (allow_shallow_copies) {
|
|
|
|
|
column_json = ovsdb_datum_to_json(
|
|
|
|
|
&new->fields[idx], type);
|
|
|
|
|
} else {
|
|
|
|
|
column_json = ovsdb_datum_to_json_deep(
|
|
|
|
|
&new->fields[idx], type);
|
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
}
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (!row) {
|
|
|
|
|
row = json_object_create();
|
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
json_object_put(row, column->name, column_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (row) {
|
2023-06-14 02:34:40 +08:00
|
|
|
|
ovs_assert(new || old);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
struct ovsdb_table *table = new ? new->table : old->table;
|
|
|
|
|
char uuid[UUID_LEN + 1];
|
|
|
|
|
|
2023-06-14 02:34:40 +08:00
|
|
|
|
ovs_assert(table);
|
|
|
|
|
|
2010-02-10 16:35:24 -08:00
|
|
|
|
if (table != ftxn->table) {
|
2009-11-13 13:37:55 -08:00
|
|
|
|
/* Create JSON object for transaction overall. */
|
2010-02-10 16:35:24 -08:00
|
|
|
|
if (!ftxn->json) {
|
|
|
|
|
ftxn->json = json_object_create();
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Create JSON object for transaction on this table. */
|
2010-02-10 16:35:24 -08:00
|
|
|
|
ftxn->table_json = json_object_create();
|
|
|
|
|
ftxn->table = table;
|
|
|
|
|
json_object_put(ftxn->json, table->schema->name, ftxn->table_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Add row to transaction for this table. */
|
|
|
|
|
snprintf(uuid, sizeof uuid,
|
|
|
|
|
UUID_FMT, UUID_ARGS(ovsdb_row_get_uuid(new ? new : old)));
|
2010-02-10 16:35:24 -08:00
|
|
|
|
json_object_put(ftxn->table_json, uuid, row);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
}
|
2017-12-28 13:21:11 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Opens the standalone database file 'filename' (for read/write access if
 * 'rw' is true, otherwise read-only) and replays every transaction recorded
 * in it, returning the resulting in-memory database.
 *
 * If 'new_schema' is nonnull, it is used in place of the schema embedded in
 * the file; ownership of 'new_schema' then passes to the new database via
 * ovsdb_create().  The 'new_schema != NULL' flag is also forwarded to
 * ovsdb_file_txn_from_json() -- presumably to enable conversion of rows to
 * the replacement schema; confirm against that function's definition.
 *
 * Unrecoverable errors (unreadable storage, malformed transaction JSON)
 * terminate the process via ovs_fatal(). */
static struct ovsdb *
ovsdb_file_read__(const char *filename, bool rw,
                  struct ovsdb_schema *new_schema)
{
    struct ovsdb_storage *storage = ovsdb_storage_open_standalone(filename,
                                                                  rw);
    struct ovsdb_schema *schema = ovsdb_storage_read_schema(storage);
    if (new_schema) {
        /* Caller supplied a replacement schema: discard the on-disk one. */
        ovsdb_schema_destroy(schema);
        schema = new_schema;
    }
    struct ovsdb *ovsdb = ovsdb_create(schema, storage);
    for (;;) {
        /* Read a transaction.  Bail if end-of-file. */
        struct json *txn_json;
        struct ovsdb_schema *schema2;
        struct ovsdb_error *error = ovsdb_storage_read(storage, &schema2,
                                                       &txn_json, NULL);
        if (error) {
            ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
        }
        /* NOTE(review): asserts that no record after the first yields a
         * schema -- presumably standalone files embed the schema only once,
         * up front; confirm against storage.c. */
        ovs_assert(!schema2);
        if (!txn_json) {
            break;
        }

        /* Apply transaction to database. */
        struct ovsdb_txn *txn;
        error = ovsdb_file_txn_from_json(ovsdb, txn_json, new_schema != NULL,
                                         &txn);
        if (error) {
            ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
        }
        json_destroy(txn_json);

        error = ovsdb_txn_replay_commit(txn);
        if (error) {
            /* A transaction that fails to commit ends the replay: drop the
             * error, hand the record back to the storage layer via
             * ovsdb_storage_unread(), and stop -- the database keeps
             * everything replayed so far.  NOTE(review): looks like this
             * tolerates a corrupt/partial trailing record; verify the
             * intended recovery semantics against storage.c. */
            ovsdb_error_destroy(error);
            ovsdb_storage_unread(storage);
            break;
        }
    }
    return ovsdb;
}
|
2017-12-28 13:21:11 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Reads 'filename' as a standalone database.  Returns the new database.  On
 * error, prints a message on stderr and terminates the process.
 *
 * If 'rw' is true, opens the database for read/write access, otherwise
 * read-only. */
struct ovsdb *
ovsdb_file_read(const char *filename, bool rw)
{
    /* NULL schema: use the schema embedded in the file itself. */
    return ovsdb_file_read__(filename, rw, NULL);
}
|
2017-12-28 13:21:11 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Reads 'filename' as a standalone database, using 'schema' in place of the
|
|
|
|
|
* schema embedded in the file. Returns the new database. On error,
|
|
|
|
|
* prints a message on stderr and terminates the process.
|
|
|
|
|
*
|
|
|
|
|
* Consumes 'schema'. */
|
|
|
|
|
struct ovsdb *
|
|
|
|
|
ovsdb_file_read_as_schema(const char *filename, struct ovsdb_schema *schema)
|
|
|
|
|
{
|
|
|
|
|
return ovsdb_file_read__(filename, false, schema);
|
2017-12-28 13:21:11 -08:00
|
|
|
|
}
|