2017-09-12 16:28:28 -07:00
|
|
|
|
/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2016, 2017 Nicira, Inc.
|
2009-11-13 13:37:55 -08:00
|
|
|
|
*
|
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
|
* You may obtain a copy of the License at:
|
|
|
|
|
*
|
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
*
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
|
* limitations under the License.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
|
|
|
|
|
#include "file.h"
|
|
|
|
|
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include <errno.h>
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include <fcntl.h>
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include <unistd.h>
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2010-03-11 17:14:31 -08:00
|
|
|
|
#include "bitmap.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "column.h"
|
|
|
|
|
#include "log.h"
|
2016-07-12 16:37:34 -05:00
|
|
|
|
#include "openvswitch/json.h"
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include "lockfile.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "ovsdb.h"
|
|
|
|
|
#include "ovsdb-error.h"
|
|
|
|
|
#include "row.h"
|
2010-03-18 11:24:55 -07:00
|
|
|
|
#include "socket-util.h"
|
2017-12-31 21:15:58 -08:00
|
|
|
|
#include "storage.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "table.h"
|
2009-12-16 13:30:53 -08:00
|
|
|
|
#include "timeval.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "transaction.h"
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
#include "unixctl.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
#include "uuid.h"
|
|
|
|
|
#include "util.h"
|
2014-12-15 14:10:38 +01:00
|
|
|
|
#include "openvswitch/vlog.h"
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2010-10-19 14:47:01 -07:00
|
|
|
|
VLOG_DEFINE_THIS_MODULE(ovsdb_file);
|
2010-07-16 11:02:49 -07:00
|
|
|
|
|
2010-02-10 16:35:24 -08:00
|
|
|
|
/* A transaction being converted to JSON for writing to a file.
 *
 * 'json' accumulates the whole transaction object; 'table_json' is the
 * sub-object for the table currently being emitted, and 'table' identifies
 * which table that is (so consecutive rows of the same table share one
 * sub-object). */
struct ovsdb_file_txn {
    struct json *json;          /* JSON for the whole transaction. */
    struct json *table_json;    /* JSON for 'table''s transaction. */
    struct ovsdb_table *table;  /* Table described in 'table_json'. */
};
|
|
|
|
|
|
|
|
|
|
static void ovsdb_file_txn_init(struct ovsdb_file_txn *);
/* Adds the 'old' -> 'new' change for the columns marked in 'changed' to a
 * file transaction.  When 'allow_shallow_copies' is false, datum-to-JSON
 * conversion must avoid shallow json_clone() copies so that the call is safe
 * from the compaction thread (see the reference-counter race note above). */
static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *,
                                   const struct ovsdb_row *old,
                                   const struct ovsdb_row *new,
                                   const unsigned long int *changed,
                                   bool allow_shallow_copies);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
/* If set to 'true', file transactions will contain difference between
 * datums of old and new rows and not the whole new datum for the column.
 *
 * Enabled by default; can be turned off for cluster upgrades, since older
 * ovsdb-servers cannot read databases written with column diffs. */
static bool use_column_diff = true;
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
ovsdb_file_column_diff_enable(struct unixctl_conn *conn, int argc OVS_UNUSED,
|
|
|
|
|
const char *argv[] OVS_UNUSED,
|
|
|
|
|
void *arg OVS_UNUSED)
|
|
|
|
|
{
|
|
|
|
|
use_column_diff = true;
|
|
|
|
|
unixctl_command_reply(conn, NULL);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
ovsdb_file_column_diff_disable(void)
|
|
|
|
|
{
|
|
|
|
|
if (!use_column_diff) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
use_column_diff = false;
|
|
|
|
|
unixctl_command_register("ovsdb/file/column-diff-enable", "",
|
|
|
|
|
0, 0, ovsdb_file_column_diff_enable, NULL);
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-12 11:26:54 -08:00
|
|
|
|
/* Updates 'row' in place from the JSON object 'json', which maps column
 * names to column values (or, if 'row_contains_diff' is true, to diffs
 * against the row's current values).
 *
 * If 'converting' is true, unknown column names are silently skipped
 * (the schema is being converted and may have dropped columns); otherwise
 * an unknown column is a syntax error.
 *
 * Returns NULL on success, otherwise an ovsdb_error that the caller owns. */
static struct ovsdb_error *
ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting,
                                bool row_contains_diff,
                                const struct json *json)
{
    struct ovsdb_table_schema *schema = row->table->schema;
    struct ovsdb_error *error;
    struct shash_node *node;

    if (json->type != JSON_OBJECT) {
        return ovsdb_syntax_error(json, NULL, "row must be JSON object");
    }

    SHASH_FOR_EACH (node, json_object(json)) {
        const char *column_name = node->name;
        const struct ovsdb_column *column;
        struct ovsdb_datum datum;

        column = ovsdb_table_schema_get_column(schema, column_name);
        if (!column) {
            if (converting) {
                /* Schema conversion may legitimately drop columns. */
                continue;
            }
            return ovsdb_syntax_error(json, "unknown column",
                                      "No column %s in table %s.",
                                      column_name, schema->name);
        }

        if (row_contains_diff) {
            /* Diff may violate the type size rules (e.g. a diff for a
             * single-element column can hold both old and new values), so
             * parse it without size constraints. */
            error = ovsdb_transient_datum_from_json(&datum, &column->type,
                                                    node->data);
        } else {
            error = ovsdb_datum_from_json(&datum, &column->type,
                                          node->data, NULL);
        }
        if (error) {
            return error;
        }

        if (row_contains_diff
            && !ovsdb_datum_is_default(&row->fields[column->index],
                                       &column->type)) {
            /* Apply the diff on top of the row's existing value.  'datum'
             * is consumed here and must be destroyed on both paths. */
            error = ovsdb_datum_apply_diff_in_place(
                        &row->fields[column->index],
                        &datum, &column->type);
            ovsdb_datum_destroy(&datum, &column->type);
            if (error) {
                return error;
            }
        } else {
            /* Whole-value replacement: take ownership of the parsed datum
             * by swapping, then destroy the row's old value (now in
             * 'datum').  A diff against a default (empty) value lands here
             * too, since applying a diff to an empty datum is the same as
             * replacement. */
            ovsdb_datum_swap(&row->fields[column->index], &datum);
            ovsdb_datum_destroy(&datum, &column->type);
        }
    }

    return NULL;
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
static struct ovsdb_error *
|
|
|
|
|
ovsdb_file_txn_row_from_json(struct ovsdb_txn *txn, struct ovsdb_table *table,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
bool converting, bool row_contains_diff,
|
2009-11-13 13:37:55 -08:00
|
|
|
|
const struct uuid *row_uuid, struct json *json)
|
|
|
|
|
{
|
|
|
|
|
const struct ovsdb_row *row = ovsdb_table_get_row(table, row_uuid);
|
|
|
|
|
if (json->type == JSON_NULL) {
|
|
|
|
|
if (!row) {
|
|
|
|
|
return ovsdb_syntax_error(NULL, NULL, "transaction deletes "
|
|
|
|
|
"row "UUID_FMT" that does not exist",
|
|
|
|
|
UUID_ARGS(row_uuid));
|
|
|
|
|
}
|
|
|
|
|
ovsdb_txn_row_delete(txn, row);
|
|
|
|
|
return NULL;
|
|
|
|
|
} else if (row) {
|
2010-02-12 11:26:54 -08:00
|
|
|
|
return ovsdb_file_update_row_from_json(ovsdb_txn_row_modify(txn, row),
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
converting, row_contains_diff,
|
|
|
|
|
json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
} else {
|
|
|
|
|
struct ovsdb_error *error;
|
|
|
|
|
struct ovsdb_row *new;
|
|
|
|
|
|
|
|
|
|
new = ovsdb_row_create(table);
|
|
|
|
|
*ovsdb_row_get_uuid_rw(new) = *row_uuid;
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
error = ovsdb_file_update_row_from_json(new, converting,
|
|
|
|
|
row_contains_diff, json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (error) {
|
|
|
|
|
ovsdb_row_destroy(new);
|
2011-07-06 14:22:42 -07:00
|
|
|
|
} else {
|
|
|
|
|
ovsdb_txn_row_insert(txn, new);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
return error;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static struct ovsdb_error *
|
|
|
|
|
ovsdb_file_txn_table_from_json(struct ovsdb_txn *txn,
|
2010-02-12 11:26:54 -08:00
|
|
|
|
struct ovsdb_table *table,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
bool converting,
|
|
|
|
|
bool row_contains_diff,
|
|
|
|
|
struct json *json)
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
|
|
|
|
|
if (json->type != JSON_OBJECT) {
|
|
|
|
|
return ovsdb_syntax_error(json, NULL, "object expected");
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-24 10:32:59 -07:00
|
|
|
|
SHASH_FOR_EACH (node, json->object) {
|
2009-11-13 13:37:55 -08:00
|
|
|
|
const char *uuid_string = node->name;
|
|
|
|
|
struct json *txn_row_json = node->data;
|
|
|
|
|
struct ovsdb_error *error;
|
|
|
|
|
struct uuid row_uuid;
|
|
|
|
|
|
|
|
|
|
if (!uuid_from_string(&row_uuid, uuid_string)) {
|
|
|
|
|
return ovsdb_syntax_error(json, NULL, "\"%s\" is not a valid UUID",
|
|
|
|
|
uuid_string);
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-12 11:26:54 -08:00
|
|
|
|
error = ovsdb_file_txn_row_from_json(txn, table, converting,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
row_contains_diff,
|
2010-02-12 11:26:54 -08:00
|
|
|
|
&row_uuid, txn_row_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (error) {
|
|
|
|
|
return error;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2010-03-18 11:24:55 -07:00
|
|
|
|
/* Converts 'json' to an ovsdb_txn for 'db', storing the new transaction in
|
|
|
|
|
* '*txnp'. Returns NULL if successful, otherwise an error.
|
|
|
|
|
*
|
|
|
|
|
* If 'converting' is true, then unknown table and column names are ignored
|
|
|
|
|
* (which can ease upgrading and downgrading schemas); otherwise, they are
|
2013-09-13 18:52:53 -07:00
|
|
|
|
* treated as errors. */
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb_error *
|
2009-11-13 13:37:55 -08:00
|
|
|
|
ovsdb_file_txn_from_json(struct ovsdb *db, const struct json *json,
|
2013-09-13 18:52:53 -07:00
|
|
|
|
bool converting, struct ovsdb_txn **txnp)
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
|
|
|
|
struct ovsdb_error *error;
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
struct ovsdb_txn *txn;
|
|
|
|
|
|
|
|
|
|
*txnp = NULL;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (json->type != JSON_OBJECT) {
|
|
|
|
|
return ovsdb_syntax_error(json, NULL, "object expected");
|
|
|
|
|
}
|
|
|
|
|
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
struct json *is_diff = shash_find_data(json->object, "_is_diff");
|
|
|
|
|
bool row_contains_diff = false;
|
|
|
|
|
|
|
|
|
|
if (is_diff && is_diff->type == JSON_TRUE) {
|
|
|
|
|
row_contains_diff = true;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
txn = ovsdb_txn_create(db);
|
2018-05-24 10:32:59 -07:00
|
|
|
|
SHASH_FOR_EACH (node, json->object) {
|
2009-11-13 13:37:55 -08:00
|
|
|
|
const char *table_name = node->name;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
struct json *node_json = node->data;
|
2009-11-13 13:37:55 -08:00
|
|
|
|
struct ovsdb_table *table;
|
|
|
|
|
|
|
|
|
|
table = shash_find_data(&db->tables, table_name);
|
|
|
|
|
if (!table) {
|
2009-12-16 13:30:53 -08:00
|
|
|
|
if (!strcmp(table_name, "_date")
|
2010-03-18 11:24:55 -07:00
|
|
|
|
&& node_json->type == JSON_INTEGER) {
|
|
|
|
|
continue;
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
} else if (!strcmp(table_name, "_is_diff")
|
|
|
|
|
&& (node_json->type == JSON_TRUE
|
|
|
|
|
|| node_json->type == JSON_FALSE)) {
|
|
|
|
|
continue;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
} else if (!strcmp(table_name, "_comment") || converting) {
|
2009-12-16 13:30:53 -08:00
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
error = ovsdb_syntax_error(json, "unknown table",
|
|
|
|
|
"No table named %s.", table_name);
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-12 11:26:54 -08:00
|
|
|
|
error = ovsdb_file_txn_table_from_json(txn, table, converting,
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
row_contains_diff, node_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (error) {
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
*txnp = txn;
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
error:
|
|
|
|
|
ovsdb_txn_abort(txn);
|
|
|
|
|
return error;
|
|
|
|
|
}
|
2010-02-12 11:26:54 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Copies every row of 'src_table' into 'dst_table' as part of 'txn',
 * converting column data to the destination column types where needed.
 * Source columns with no same-named destination column are dropped.
 * Returns NULL if successful, otherwise an error; on error the partially
 * converted destination row has been destroyed. */
static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
ovsdb_convert_table(struct ovsdb_txn *txn,
                    const struct ovsdb_table *src_table,
                    struct ovsdb_table *dst_table)
{
    const struct ovsdb_column **dst_columns;
    struct ovsdb_error *error = NULL;
    const struct ovsdb_row *src_row;
    unsigned long *src_equal;
    struct shash_node *node;
    size_t n_src_columns;

    /* Precompute, per source column index, the matching destination column
     * (NULL entry if none) and a bitmap of columns whose types are equal,
     * so the per-row loop below avoids repeated hash lookups and type
     * comparisons. */
    n_src_columns = shash_count(&src_table->schema->columns);
    src_equal = bitmap_allocate(n_src_columns);
    dst_columns = xzalloc(n_src_columns * sizeof *dst_columns);

    SHASH_FOR_EACH (node, &src_table->schema->columns) {
        const struct ovsdb_column *src_column = node->data;

        /* _uuid and _version are set explicitly for each new row below. */
        if (src_column->index == OVSDB_COL_UUID ||
            src_column->index == OVSDB_COL_VERSION) {
            continue;
        }

        const struct ovsdb_column *dst_column =
            shash_find_data(&dst_table->schema->columns, src_column->name);

        if (!dst_column) {
            continue;
        }

        dst_columns[src_column->index] = dst_column;

        if (ovsdb_type_equals(&src_column->type, &dst_column->type)) {
            bitmap_set1(src_equal, src_column->index);
        }
    }

    HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) {
        struct ovsdb_row *dst_row = ovsdb_row_create(dst_table);
        *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row);

        SHASH_FOR_EACH (node, &src_table->schema->columns) {
            const struct ovsdb_column *src_column = node->data;
            const struct ovsdb_column *dst_column;

            dst_column = dst_columns[src_column->index];
            if (!dst_column) {
                continue;
            }

            /* Discard the default datum created by ovsdb_row_create()
             * before storing the converted (or cloned) source value. */
            ovsdb_datum_destroy(&dst_row->fields[dst_column->index],
                                &dst_column->type);

            if (bitmap_is_set(src_equal, src_column->index)) {
                /* This column didn't change - no need to convert. */
                ovsdb_datum_clone(&dst_row->fields[dst_column->index],
                                  &src_row->fields[src_column->index]);
                continue;
            }

            error = ovsdb_datum_convert(
                &dst_row->fields[dst_column->index], &dst_column->type,
                &src_row->fields[src_column->index], &src_column->type);
            if (error) {
                /* The datum was destroyed above; re-initialize it to empty
                 * so that ovsdb_row_destroy() does not free it again. */
                ovsdb_datum_init_empty(&dst_row->fields[dst_column->index]);
                ovsdb_row_destroy(dst_row);
                goto exit;
            }
        }

        ovsdb_txn_row_insert(txn, dst_row);
    }

exit:
    free(dst_columns);
    bitmap_free(src_equal);
    return error;
}
|
2010-03-18 11:24:55 -07:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Copies the data in 'src', converts it into the schema specified in
|
|
|
|
|
* 'new_schema', and puts it into a newly created, unbacked database, and
|
|
|
|
|
* stores a pointer to the new database in '*dstp'. Returns null if
|
|
|
|
|
* successful, otherwise an error; on error, stores NULL in '*dstp'. */
|
|
|
|
|
struct ovsdb_error * OVS_WARN_UNUSED_RESULT
|
|
|
|
|
ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema,
|
|
|
|
|
struct ovsdb **dstp)
|
2010-03-18 11:24:55 -07:00
|
|
|
|
{
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb *dst = ovsdb_create(ovsdb_schema_clone(new_schema),
|
2021-06-01 21:52:08 +02:00
|
|
|
|
ovsdb_storage_create_unbacked(NULL));
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb_txn *txn = ovsdb_txn_create(dst);
|
|
|
|
|
struct ovsdb_error *error = NULL;
|
2011-02-08 15:36:21 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct shash_node *node;
|
|
|
|
|
SHASH_FOR_EACH (node, &src->tables) {
|
|
|
|
|
const char *table_name = node->name;
|
|
|
|
|
struct ovsdb_table *src_table = node->data;
|
|
|
|
|
struct ovsdb_table *dst_table = shash_find_data(&dst->tables,
|
|
|
|
|
table_name);
|
|
|
|
|
if (!dst_table) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
error = ovsdb_convert_table(txn, src_table, dst_table);
|
|
|
|
|
if (error) {
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
2010-03-18 11:24:55 -07:00
|
|
|
|
}
|
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
error = ovsdb_txn_replay_commit(txn);
|
|
|
|
|
if (error) {
|
|
|
|
|
txn = NULL; /* ovsdb_txn_replay_commit() already aborted. */
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
2010-03-18 11:24:55 -07:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
*dstp = dst;
|
2010-03-18 11:24:55 -07:00
|
|
|
|
return NULL;
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
error:
|
|
|
|
|
ovsdb_destroy(dst);
|
|
|
|
|
if (txn) {
|
|
|
|
|
ovsdb_txn_abort(txn);
|
|
|
|
|
}
|
|
|
|
|
*dstp = NULL;
|
|
|
|
|
return error;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-13 13:37:55 -08:00
|
|
|
|
static bool
|
2010-03-16 15:13:42 -07:00
|
|
|
|
ovsdb_file_change_cb(const struct ovsdb_row *old,
|
|
|
|
|
const struct ovsdb_row *new,
|
|
|
|
|
const unsigned long int *changed,
|
|
|
|
|
void *ftxn_)
|
2010-02-10 16:35:24 -08:00
|
|
|
|
{
|
|
|
|
|
struct ovsdb_file_txn *ftxn = ftxn_;
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
ovsdb_file_txn_add_row(ftxn, old, new, changed, true);
|
2010-02-10 16:35:24 -08:00
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
/* Converts the database into transaction JSON representation.
|
|
|
|
|
* If 'allow_shallow_copies' is false, makes sure that all the JSON
|
|
|
|
|
* objects in the resulted transaction JSON are separately allocated
|
|
|
|
|
* objects and not shallow clones of JSON objects already existing
|
|
|
|
|
* in the database. Useful when multiple threads are working on the
|
|
|
|
|
* same database object. */
|
2017-12-15 08:35:41 -08:00
|
|
|
|
struct json *
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
ovsdb_to_txn_json(const struct ovsdb *db, const char *comment,
|
|
|
|
|
bool allow_shallow_copies)
|
2017-12-15 08:35:41 -08:00
|
|
|
|
{
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct ovsdb_file_txn ftxn;
|
|
|
|
|
|
|
|
|
|
ovsdb_file_txn_init(&ftxn);
|
|
|
|
|
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
SHASH_FOR_EACH (node, &db->tables) {
|
|
|
|
|
const struct ovsdb_table *table = node->data;
|
|
|
|
|
const struct ovsdb_row *row;
|
|
|
|
|
|
|
|
|
|
HMAP_FOR_EACH (row, hmap_node, &table->rows) {
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL,
|
|
|
|
|
allow_shallow_copies);
|
2017-12-31 21:15:58 -08:00
|
|
|
|
}
|
2017-12-15 08:35:41 -08:00
|
|
|
|
}
|
2017-12-31 21:15:58 -08:00
|
|
|
|
|
|
|
|
|
return ovsdb_file_txn_annotate(ftxn.json, comment);
|
2017-12-15 08:35:41 -08:00
|
|
|
|
}
|
|
|
|
|
|
2017-12-28 13:21:11 -08:00
|
|
|
|
/* Returns 'txn' transformed into the JSON format that is used in OVSDB files.
|
|
|
|
|
* (But the caller must use ovsdb_file_txn_annotate() to add the _comment and
|
|
|
|
|
* _date members.) If 'txn' doesn't actually change anything, returns NULL */
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct json *
|
2017-12-28 13:21:11 -08:00
|
|
|
|
ovsdb_file_txn_to_json(const struct ovsdb_txn *txn)
|
2010-02-10 16:35:24 -08:00
|
|
|
|
{
|
|
|
|
|
struct ovsdb_file_txn ftxn;
|
|
|
|
|
|
|
|
|
|
ovsdb_file_txn_init(&ftxn);
|
2010-03-16 15:13:42 -07:00
|
|
|
|
ovsdb_txn_for_each_change(txn, ovsdb_file_change_cb, &ftxn);
|
2017-12-28 13:21:11 -08:00
|
|
|
|
return ftxn.json;
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct json *
|
|
|
|
|
ovsdb_file_txn_annotate(struct json *json, const char *comment)
|
2010-03-18 11:24:55 -07:00
|
|
|
|
{
|
2017-12-31 21:15:58 -08:00
|
|
|
|
if (!json) {
|
|
|
|
|
json = json_object_create();
|
2010-03-18 11:24:55 -07:00
|
|
|
|
}
|
2017-12-31 21:15:58 -08:00
|
|
|
|
if (comment) {
|
|
|
|
|
json_object_put_string(json, "_comment", comment);
|
2017-09-12 16:28:28 -07:00
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
if (use_column_diff) {
|
|
|
|
|
json_object_put(json, "_is_diff", json_boolean_create(true));
|
|
|
|
|
}
|
2017-12-31 21:15:58 -08:00
|
|
|
|
json_object_put(json, "_date", json_integer_create(time_wall_msec()));
|
|
|
|
|
return json;
|
2010-02-10 16:35:24 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
ovsdb_file_txn_init(struct ovsdb_file_txn *ftxn)
|
|
|
|
|
{
|
|
|
|
|
ftxn->json = NULL;
|
|
|
|
|
ftxn->table_json = NULL;
|
|
|
|
|
ftxn->table = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn,
|
|
|
|
|
const struct ovsdb_row *old,
|
2010-03-11 17:14:31 -08:00
|
|
|
|
const struct ovsdb_row *new,
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
const unsigned long int *changed,
|
|
|
|
|
bool allow_shallow_copies)
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
|
|
|
|
struct json *row;
|
|
|
|
|
|
|
|
|
|
if (!new) {
|
|
|
|
|
row = json_null_create();
|
|
|
|
|
} else {
|
|
|
|
|
struct shash_node *node;
|
|
|
|
|
|
2010-02-12 11:13:24 -08:00
|
|
|
|
row = old ? NULL : json_object_create();
|
2009-11-13 13:37:55 -08:00
|
|
|
|
SHASH_FOR_EACH (node, &new->table->schema->columns) {
|
|
|
|
|
const struct ovsdb_column *column = node->data;
|
|
|
|
|
const struct ovsdb_type *type = &column->type;
|
|
|
|
|
unsigned int idx = column->index;
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
struct ovsdb_datum datum;
|
|
|
|
|
struct json *column_json;
|
2009-11-13 13:37:55 -08:00
|
|
|
|
|
|
|
|
|
if (idx != OVSDB_COL_UUID && column->persistent
|
2010-01-11 13:14:54 -08:00
|
|
|
|
&& (old
|
2010-03-11 17:14:31 -08:00
|
|
|
|
? bitmap_is_set(changed, idx)
|
2010-01-11 13:14:54 -08:00
|
|
|
|
: !ovsdb_datum_is_default(&new->fields[idx], type)))
|
2009-11-13 13:37:55 -08:00
|
|
|
|
{
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
if (old && use_column_diff) {
|
|
|
|
|
ovsdb_datum_diff(&datum, &old->fields[idx],
|
|
|
|
|
&new->fields[idx], type);
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
if (allow_shallow_copies) {
|
|
|
|
|
column_json = ovsdb_datum_to_json(&datum, type);
|
|
|
|
|
} else {
|
|
|
|
|
column_json = ovsdb_datum_to_json_deep(&datum, type);
|
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
ovsdb_datum_destroy(&datum, type);
|
|
|
|
|
} else {
|
ovsdb: Fix race for datum JSON string reference counter.
Compaction thread supposed to not change anything in the database
it is working on, since the same data can be accessed by the main
thread at the same time. However, while converting database rows
to JSON objects, strings in the datum will be cloned using
json_clone(), which is a shallow copy, and that will change the
reference counter for the JSON string object. If both the main
thread and the compaction thread will clone/destroy the same object
at the same time we may end up with a broken reference counter
leading to a memory leak or use-after free.
Adding a new argument to the database to JSON conversion to prevent
use of shallow copies from the compaction thread. This way all
the database operations will be truly read-only avoiding the race.
'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used,
so creating separate variant for these functions instead of adding
a new argument, to avoid changing a lot of existing code.
Other solution might be to use atomic reference counters, but that
will require API/ABI break, because counter is exposed in public
headers. Also, we can not easily expose atomic functions, so we'll
need to un-inline reference counting with the associated performance
cost.
Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.")
Reported-at: https://bugzilla.redhat.com/2133431
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-10-10 15:11:57 +02:00
|
|
|
|
if (allow_shallow_copies) {
|
|
|
|
|
column_json = ovsdb_datum_to_json(
|
|
|
|
|
&new->fields[idx], type);
|
|
|
|
|
} else {
|
|
|
|
|
column_json = ovsdb_datum_to_json_deep(
|
|
|
|
|
&new->fields[idx], type);
|
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
}
|
2009-11-13 13:37:55 -08:00
|
|
|
|
if (!row) {
|
|
|
|
|
row = json_object_create();
|
|
|
|
|
}
|
ovsdb: Use column diffs for ovsdb and raft log entries.
Currently, ovsdb-server stores complete value for the column in a database
file and in a raft log in case this column changed. This means that
transaction that adds, for example, one new acl to a port group creates
a log entry with all UUIDs of all existing acls + one new. Same for
ports in logical switches and routers and more other columns with sets
in Northbound DB.
There could be thousands of acls in one port group or thousands of ports
in a single logical switch. And the typical use case is to add one new
if we're starting a new service/VM/container or adding one new node in a
kubernetes or OpenStack cluster. This generates huge amount of traffic
within ovsdb raft cluster, grows overall memory consumption and hurts
performance since all these UUIDs are parsed and formatted to/from json
several times and stored on disks. And more values we have in a set -
more space a single log entry will occupy and more time it will take to
process by ovsdb-server cluster members.
Simple test:
1. Start OVN sandbox with clustered DBs:
# make sandbox SANDBOXFLAGS='--nbdb-model=clustered --sbdb-model=clustered'
2. Run a script that creates one port group and adds 4000 acls into it:
# cat ../memory-test.sh
pg_name=my_port_group
export OVN_NB_DAEMON=$(ovn-nbctl --pidfile --detach --log-file -vsocket_util:off)
ovn-nbctl pg-add $pg_name
for i in $(seq 1 4000); do
echo "Iteration: $i"
ovn-nbctl --log acl-add $pg_name from-lport $i udp drop
done
ovn-nbctl acl-del $pg_name
ovn-nbctl pg-del $pg_name
ovs-appctl -t $(pwd)/sandbox/nb1 memory/show
ovn-appctl -t ovn-nbctl exit
---
4. Check the current memory consumption of ovsdb-server processes and
space occupied by database files:
# ls sandbox/[ns]b*.db -alh
# ps -eo vsz,rss,comm,cmd | egrep '=[ns]b[123].pid'
Test results with current ovsdb log format:
On-disk Nb DB size : ~369 MB
RSS of Nb ovsdb-servers: ~2.7 GB
Time to finish the test: ~2m
In order to mitigate memory consumption issues and reduce computational
load on ovsdb-servers let's store diff between old and new values
instead. This will make size of each log entry that adds single acl to
port group (or port to logical switch or anything else like that) very
small and independent from the number of already existing acls (ports,
etc.).
Added a new marker '_is_diff' into a file transaction to specify that
this transaction contains diffs instead of replacements for the existing
data.
One side effect is that this change will actually increase the size of
file transaction that removes more than a half of entries from the set,
because diff will be larger than the resulted new value. However, such
operations are rare.
Test results with change applied:
On-disk Nb DB size : ~2.7 MB ---> reduced by 99%
RSS of Nb ovsdb-servers: ~580 MB ---> reduced by 78%
Time to finish the test: ~1m27s ---> reduced by 27%
After this change new ovsdb-server is still able to read old databases,
but old ovsdb-server will not be able to read new ones.
Since new servers could join ovsdb cluster dynamically it's hard to
implement any runtime mechanism to handle cases where different
versions of ovsdb-server joins the cluster. However we still need to
handle cluster upgrades. For this case added special command line
argument to disable new functionality. Documentation updated with the
recommended way to upgrade the ovsdb cluster.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-12-11 21:54:47 +01:00
|
|
|
|
json_object_put(row, column->name, column_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (row) {
|
2023-06-14 02:34:40 +08:00
|
|
|
|
ovs_assert(new || old);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
struct ovsdb_table *table = new ? new->table : old->table;
|
|
|
|
|
char uuid[UUID_LEN + 1];
|
|
|
|
|
|
2023-06-14 02:34:40 +08:00
|
|
|
|
ovs_assert(table);
|
|
|
|
|
|
2010-02-10 16:35:24 -08:00
|
|
|
|
if (table != ftxn->table) {
|
2009-11-13 13:37:55 -08:00
|
|
|
|
/* Create JSON object for transaction overall. */
|
2010-02-10 16:35:24 -08:00
|
|
|
|
if (!ftxn->json) {
|
|
|
|
|
ftxn->json = json_object_create();
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Create JSON object for transaction on this table. */
|
2010-02-10 16:35:24 -08:00
|
|
|
|
ftxn->table_json = json_object_create();
|
|
|
|
|
ftxn->table = table;
|
|
|
|
|
json_object_put(ftxn->json, table->schema->name, ftxn->table_json);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Add row to transaction for this table. */
|
|
|
|
|
snprintf(uuid, sizeof uuid,
|
|
|
|
|
UUID_FMT, UUID_ARGS(ovsdb_row_get_uuid(new ? new : old)));
|
2010-02-10 16:35:24 -08:00
|
|
|
|
json_object_put(ftxn->table_json, uuid, row);
|
2009-11-13 13:37:55 -08:00
|
|
|
|
}
|
|
|
|
|
}
|
2017-12-28 13:21:11 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Opens the standalone database file 'filename' (for read/write access if
 * 'rw' is true, otherwise read-only) and replays every transaction recorded
 * in it, returning the resulting in-memory database.
 *
 * If 'new_schema' is nonnull, it is used in place of the schema embedded in
 * the file; ownership of 'new_schema' then passes to the new database via
 * ovsdb_create().  The 'new_schema != NULL' flag is also forwarded to
 * ovsdb_file_txn_from_json() -- presumably to enable conversion of rows to
 * the replacement schema; confirm against that function's definition.
 *
 * Unrecoverable errors (unreadable storage, malformed transaction JSON)
 * terminate the process via ovs_fatal(). */
static struct ovsdb *
ovsdb_file_read__(const char *filename, bool rw,
                  struct ovsdb_schema *new_schema)
{
    struct ovsdb_storage *storage = ovsdb_storage_open_standalone(filename,
                                                                  rw);
    struct ovsdb_schema *schema = ovsdb_storage_read_schema(storage);
    if (new_schema) {
        /* Caller supplied a replacement schema: discard the on-disk one. */
        ovsdb_schema_destroy(schema);
        schema = new_schema;
    }
    struct ovsdb *ovsdb = ovsdb_create(schema, storage);
    for (;;) {
        /* Read a transaction.  Bail if end-of-file. */
        struct json *txn_json;
        struct ovsdb_schema *schema2;
        struct ovsdb_error *error = ovsdb_storage_read(storage, &schema2,
                                                       &txn_json, NULL);
        if (error) {
            ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
        }
        /* NOTE(review): asserts that no record after the first yields a
         * schema -- presumably standalone files embed the schema only once,
         * up front; confirm against storage.c. */
        ovs_assert(!schema2);
        if (!txn_json) {
            break;
        }

        /* Apply transaction to database. */
        struct ovsdb_txn *txn;
        error = ovsdb_file_txn_from_json(ovsdb, txn_json, new_schema != NULL,
                                         &txn);
        if (error) {
            ovs_fatal(0, "%s", ovsdb_error_to_string_free(error));
        }
        json_destroy(txn_json);

        error = ovsdb_txn_replay_commit(txn);
        if (error) {
            /* A transaction that fails to commit ends the replay: drop the
             * error, hand the record back to the storage layer via
             * ovsdb_storage_unread(), and stop -- the database keeps
             * everything replayed so far.  NOTE(review): looks like this
             * tolerates a corrupt/partial trailing record; verify the
             * intended recovery semantics against storage.c. */
            ovsdb_error_destroy(error);
            ovsdb_storage_unread(storage);
            break;
        }
    }
    return ovsdb;
}
|
2017-12-28 13:21:11 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Reads 'filename' as a standalone database.  Returns the new database.  On
 * error, prints a message on stderr and terminates the process.
 *
 * If 'rw' is true, opens the database for read/write access, otherwise
 * read-only. */
struct ovsdb *
ovsdb_file_read(const char *filename, bool rw)
{
    /* NULL schema: use the schema embedded in the file itself. */
    return ovsdb_file_read__(filename, rw, NULL);
}
|
2017-12-28 13:21:11 -08:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* Reads 'filename' as a standalone database, using 'schema' in place of the
|
|
|
|
|
* schema embedded in the file. Returns the new database. On error,
|
|
|
|
|
* prints a message on stderr and terminates the process.
|
|
|
|
|
*
|
|
|
|
|
* Consumes 'schema'. */
|
|
|
|
|
struct ovsdb *
|
|
|
|
|
ovsdb_file_read_as_schema(const char *filename, struct ovsdb_schema *schema)
|
|
|
|
|
{
|
|
|
|
|
return ovsdb_file_read__(filename, false, schema);
|
2017-12-28 13:21:11 -08:00
|
|
|
|
}
|