/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef OVSDB_OVSDB_H
#define OVSDB_OVSDB_H 1

#include "compiler.h"
#include "openvswitch/hmap.h"
#include "openvswitch/list.h"
#include "openvswitch/shash.h"
#include "openvswitch/uuid.h"
#include "ovs-thread.h"

struct json;
struct ovsdb_log;
struct ovsdb_session;
struct ovsdb_txn;
struct simap;

/* Database schema. */
struct ovsdb_schema {
    char *name;
    char *version;
    char *cksum;
    struct shash tables;    /* Contains "struct ovsdb_table_schema *"s. */
};

struct ovsdb_schema *ovsdb_schema_create(const char *name,
                                         const char *version,
                                         const char *cksum);
struct ovsdb_schema *ovsdb_schema_clone(const struct ovsdb_schema *);
void ovsdb_schema_destroy(struct ovsdb_schema *);

struct ovsdb_error *ovsdb_schema_from_file(const char *file_name,
                                           struct ovsdb_schema **)
    OVS_WARN_UNUSED_RESULT;
struct ovsdb_error *ovsdb_schema_from_json(const struct json *,
                                           struct ovsdb_schema **)
    OVS_WARN_UNUSED_RESULT;
struct json *ovsdb_schema_to_json(const struct ovsdb_schema *);

bool ovsdb_schema_equal(const struct ovsdb_schema *,
                        const struct ovsdb_schema *);

struct ovsdb_error *ovsdb_schema_check_for_ephemeral_columns(
    const struct ovsdb_schema *) OVS_WARN_UNUSED_RESULT;
void ovsdb_schema_persist_ephemeral_columns(struct ovsdb_schema *,
                                            const char *filename);
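
/* Added note (not in the original header): an OVSDB schema version string
 * has the form "<x>.<y>.<z>", e.g. "5.1.3"; the helpers below parse and
 * validate such strings. */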
struct ovsdb_version {
    unsigned int x;
    unsigned int y;
    unsigned int z;
};
bool ovsdb_parse_version(const char *, struct ovsdb_version *);
bool ovsdb_is_valid_version(const char *);

/* Database. */

struct ovsdb_txn_history_node {
    struct ovs_list node;   /* Element in struct ovsdb's txn_history list. */
    struct ovsdb_txn *txn;
};
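
/* State of an asynchronous database compaction (snapshot).
 *
 * Added summary (taken from the commit notes, not the original header): the
 * main thread makes a shallow copy of the database and records the current
 * applied index, a separate thread converts that copy to JSON and serializes
 * it, and the main thread then destroys the copy and writes the resulting
 * snapshot to disk. */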
struct ovsdb_compaction_state {
    pthread_t thread;         /* Thread handle. */

    struct ovsdb *db;         /* Copy of the database data to compact. */

    struct json *data;        /* 'db' as serialized JSON. */
    struct json *schema;      /* 'db' schema as JSON. */
    uint64_t applied_index;   /* Last applied index reported by the storage
                               * at the moment of the database copy. */

    /* Completion signaling. */
    struct seq *done;
    uint64_t seqno;

    uint64_t init_time;       /* Time spent by the main thread preparing. */
    uint64_t thread_time;     /* Time spent for compaction by the thread. */
};

struct ovsdb {
    char *name;
    struct ovsdb_schema *schema;
    struct ovsdb_storage *storage; /* If nonnull, log for transactions. */
    struct uuid prereq;
    struct ovs_list monitors;   /* Contains "struct ovsdb_monitor"s. */
    struct shash tables;        /* Contains "struct ovsdb_table *"s. */

    /* Triggers. */
    struct ovs_list triggers;   /* Contains "struct ovsdb_trigger"s. */
    bool run_triggers;
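    /* Added note (summarized from the commit history, not the original
     * header): says whether triggers should be re-run immediately; it is
     * cleared when a trigger fails with a "not leader" cluster error so the
     * server does not busy-loop retrying, and the retry happens on the next
     * raft event instead. */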
    bool run_triggers_now;

    struct ovsdb_table *rbac_role;

    /* Transaction history for incremental monitor transfer. */
    bool need_txn_history;      /* Need to maintain history of transactions. */
    unsigned int n_txn_history; /* Current number of history transactions. */
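    /* Added note (summarized from the commit history, not the original
     * header): the history is trimmed so that the total number of atoms it
     * holds never exceeds 'n_atoms', the number of atoms in the database
     * itself, which keeps its memory usage bounded. */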
    unsigned int n_txn_history_atoms; /* Total number of atoms in history. */
    struct ovs_list txn_history; /* Contains "struct ovsdb_txn_history_node"s. */

    size_t n_atoms; /* Total number of ovsdb atoms in the database. */

    bool read_only; /* If 'true', JSON-RPC clients are not allowed to change
                     * the data. */

    /* Relay mode. */
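    /* Added note (summarized from the commit history, not the original
     * header): a relay serves read-only transactions and monitors by itself
     * but forwards transactions that modify the database to the relay
     * source; the members below track that forwarding. */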
    bool is_relay;  /* True if the database is in relay mode. */
    /* List that holds transactions waiting to be forwarded to the server. */
    struct ovs_list txn_forward_new;
    /* Hash map for transactions that have been sent and are waiting for a
     * reply. */
    struct hmap txn_forward_sent;

    /* Database compaction. */
    struct ovsdb_compaction_state *snap_state;
};

/* Total number of 'weak reference' objects in all databases
 * and transactions. */
extern size_t n_weak_refs;

struct ovsdb *ovsdb_create(struct ovsdb_schema *, struct ovsdb_storage *);
void ovsdb_destroy(struct ovsdb *);

void ovsdb_no_data_conversion_disable(void);
bool ovsdb_conversion_with_no_data_supported(const struct ovsdb *);

void ovsdb_get_memory_usage(const struct ovsdb *, struct simap *usage);

struct ovsdb_table *ovsdb_get_table(const struct ovsdb *, const char *);

struct ovsdb_txn *ovsdb_execute_compose(
    struct ovsdb *, const struct ovsdb_session *, const struct json *params,
    bool read_only, const char *role, const char *id,
    long long int elapsed_msec, long long int *timeout_msec,
    bool *durable, bool *forwarding_needed, struct json **);

struct json *ovsdb_execute(struct ovsdb *, const struct ovsdb_session *,
                           const struct json *params, bool read_only,
                           const char *role, const char *id,
                           long long int elapsed_msec,
                           long long int *timeout_msec);
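
/* Added usage sketch (illustrative, not part of the original header): the
 * 'params' argument above follows the RFC 7047 "transact" format, a JSON
 * array holding the database name followed by the operations, e.g.:
 *
 *     struct json *params = json_from_string(
 *         "[\"mydb\", {\"op\": \"select\", \"table\": \"T\", \"where\": []}]");
 *     struct json *reply = ovsdb_execute(db, NULL, params, false,
 *                                        NULL, NULL, 0, NULL);
 *     json_destroy(params);
 *     json_destroy(reply);
 *
 * "mydb" and "T" are hypothetical database and table names. */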

struct ovsdb_error *ovsdb_snapshot(struct ovsdb *, bool trim_memory)
    OVS_WARN_UNUSED_RESULT;
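/* Added note (summarized from the commit history, not the original header):
 * when 'trim_memory' is true, ovsdb-server additionally tries to return
 * unused heap memory to the OS after a successful compaction (malloc_trim()
 * on glibc); it is off by default because it can slow down snapshot creation
 * and subsequent allocations. */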

void ovsdb_snapshot_wait(struct ovsdb *);
bool ovsdb_snapshot_in_progress(struct ovsdb *);
bool ovsdb_snapshot_ready(struct ovsdb *);
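
/* Added note (illustrative, based on the compaction commit notes): snapshots
 * may be taken asynchronously.  ovsdb_snapshot() starts the operation and
 * later completes it, ovsdb_snapshot_in_progress() and ovsdb_snapshot_ready()
 * report whether the helper thread is still running or has finished, and
 * ovsdb_snapshot_wait() arranges for the poll loop to wake up when it is
 * done. */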

void ovsdb_replace(struct ovsdb *dst, struct ovsdb *src);

#endif /* ovsdb/ovsdb.h */