/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
 */

#ifndef OVSDB_OVSDB_H
#define OVSDB_OVSDB_H 1

#include "compiler.h"
#include "openvswitch/hmap.h"
#include "openvswitch/list.h"
#include "openvswitch/shash.h"
#include "openvswitch/uuid.h"
#include "ovs-thread.h"
2009-11-04 15:11:44 -08:00
struct json;
struct ovsdb_log;
struct ovsdb_session;
struct ovsdb_txn;
struct simap;

/* Database schema. */
struct ovsdb_schema {
    char *name;
    char *version;
    char *cksum;
    struct shash tables;        /* Contains "struct ovsdb_table_schema *"s. */
};
struct ovsdb_schema *ovsdb_schema_create(const char *name,
                                         const char *version,
                                         const char *cksum);
struct ovsdb_schema *ovsdb_schema_clone(const struct ovsdb_schema *);
void ovsdb_schema_destroy(struct ovsdb_schema *);

struct ovsdb_error *ovsdb_schema_from_file(const char *file_name,
                                           struct ovsdb_schema **)
    OVS_WARN_UNUSED_RESULT;
struct ovsdb_error *ovsdb_schema_from_json(const struct json *,
                                           struct ovsdb_schema **)
    OVS_WARN_UNUSED_RESULT;
struct json *ovsdb_schema_to_json(const struct ovsdb_schema *);
bool ovsdb_schema_equal(const struct ovsdb_schema *,
                        const struct ovsdb_schema *);

struct ovsdb_error *ovsdb_schema_check_for_ephemeral_columns(
    const struct ovsdb_schema *) OVS_WARN_UNUSED_RESULT;
void ovsdb_schema_persist_ephemeral_columns(struct ovsdb_schema *,
                                            const char *filename);
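
/* Example (an illustrative sketch, not part of the API): loading a schema
 * from disk, serializing it back to JSON, and releasing it.  The file name
 * is hypothetical; ovsdb_error_destroy() and json_destroy() come from
 * ovsdb-error.h and openvswitch/json.h.
 *
 *     struct ovsdb_schema *schema;
 *     struct ovsdb_error *error;
 *
 *     error = ovsdb_schema_from_file("conf.db.schema", &schema);
 *     if (error) {
 *         ovsdb_error_destroy(error);    // Or report it to the caller.
 *         return;
 *     }
 *     struct json *json = ovsdb_schema_to_json(schema);
 *     ...use 'json'...
 *     json_destroy(json);
 *     ovsdb_schema_destroy(schema);
 */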

struct ovsdb_version {
    unsigned int x;
    unsigned int y;
    unsigned int z;
};

bool ovsdb_parse_version(const char *, struct ovsdb_version *);
bool ovsdb_is_valid_version(const char *);
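
/* Example (an illustrative sketch): parsing and validating an "x.y.z"
 * version string.
 *
 *     struct ovsdb_version version;
 *     if (ovsdb_parse_version("5.1.3", &version)) {
 *         // Assumed result: version.x == 5, version.y == 1, version.z == 3.
 *     }
 */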

/* Database. */

struct ovsdb_txn_history_node {
    struct ovs_list node;       /* Element in struct ovsdb's txn_history. */
    struct ovsdb_txn *txn;
};

/* State of an ongoing database compaction.  Preparation of the snapshot
 * JSON runs in a separate thread so that the main thread can keep
 * processing database requests in the meantime:
 *
 *   1. Initialization (main thread): create a shallow copy of the database,
 *      remember the current database index, start the compaction thread.
 *   2. Wait: the thread converts the copy to JSON and serializes it while
 *      the main thread continues normal operation.
 *   3. Finish (main thread): destroy the copy, take the snapshot created by
 *      the thread, and write it to disk. */
struct ovsdb_compaction_state {
    pthread_t thread;           /* Thread handle. */
    struct ovsdb *db;           /* Copy of the database data to compact. */

    struct json *data;          /* 'db' as serialized JSON. */
    struct json *schema;        /* 'db' schema JSON. */
    uint64_t applied_index;     /* Last applied index reported by the storage
                                 * at the moment of the database copy. */
    /* Completion signaling. */
    struct seq *done;
    uint64_t seqno;

    uint64_t init_time;         /* Time spent by the main thread preparing. */
    uint64_t thread_time;       /* Time spent for compaction by the thread. */
};

struct ovsdb {
    char *name;
    struct ovsdb_schema *schema;
    struct ovsdb_storage *storage; /* If nonnull, log for transactions. */
    struct uuid prereq;
    struct ovs_list monitors;   /* Contains "struct ovsdb_monitor"s. */
    struct shash tables;        /* Contains "struct ovsdb_table *"s. */

    /* Triggers. */
    struct ovs_list triggers;   /* Contains "struct ovsdb_trigger"s. */
    bool run_triggers;
    bool run_triggers_now;

    struct ovsdb_table *rbac_role;

    /* History of transactions for incremental monitor transfer. */
    bool need_txn_history;      /* Need to maintain history of transactions. */
    unsigned int n_txn_history; /* Current number of history transactions. */
    unsigned int n_txn_history_atoms; /* Total number of atoms in history. */
    struct ovs_list txn_history; /* Contains "struct ovsdb_txn_history_node"s. */

    size_t n_atoms;             /* Total number of ovsdb atoms in the
                                 * database. */

    bool read_only;             /* If 'true', JSON-RPC clients are not
                                 * allowed to change the data. */

    /* Relay mode. */
    bool is_relay;              /* True if the database is in relay mode. */
    /* List that holds transactions waiting to be forwarded to the server. */
    struct ovs_list txn_forward_new;
    /* Hash map for transactions that are already sent and wait for a reply. */
    struct hmap txn_forward_sent;

    /* Database compaction. */
    struct ovsdb_compaction_state *snap_state;
};

/* Total number of 'weak reference' objects in all databases
 * and transactions. */
extern size_t n_weak_refs;

struct ovsdb *ovsdb_create(struct ovsdb_schema *, struct ovsdb_storage *);
void ovsdb_destroy(struct ovsdb *);
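
/* Example (an illustrative sketch): creating and destroying a database with
 * no backing storage.  ovsdb_create() takes ownership of the schema;
 * passing a null storage is assumed here to yield a purely in-memory
 * database.
 *
 *     struct ovsdb_schema *schema;
 *     if (!ovsdb_schema_from_file("conf.db.schema", &schema)) {
 *         struct ovsdb *db = ovsdb_create(schema, NULL);
 *         ...use 'db'...
 *         ovsdb_destroy(db);
 *     }
 */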

/* Schema conversion without converted data: for clustered databases, only
 * the new schema is written to the storage and each server converts its own
 * copy of the database, which is much faster than writing a full converted
 * snapshot. */
void ovsdb_no_data_conversion_disable(void);
bool ovsdb_conversion_with_no_data_supported(const struct ovsdb *);

void ovsdb_get_memory_usage(const struct ovsdb *, struct simap *usage);
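
/* Example (an illustrative sketch): collecting memory usage counters.
 * SIMAP_INITIALIZER() and simap_destroy() come from simap.h.
 *
 *     struct simap usage = SIMAP_INITIALIZER(&usage);
 *     ovsdb_get_memory_usage(db, &usage);
 *     ...report the counters accumulated in 'usage'...
 *     simap_destroy(&usage);
 */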

struct ovsdb_table *ovsdb_get_table(const struct ovsdb *, const char *);

struct ovsdb_txn *ovsdb_execute_compose(
    struct ovsdb *, const struct ovsdb_session *, const struct json *params,
    bool read_only, const char *role, const char *id,
    long long int elapsed_msec, long long int *timeout_msec,
    bool *durable, bool *forwarding_needed, struct json **);

struct json *ovsdb_execute(struct ovsdb *, const struct ovsdb_session *,
                           const struct json *params, bool read_only,
                           const char *role, const char *id,
                           long long int elapsed_msec,
                           long long int *timeout_msec);
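
/* Example (an illustrative sketch): executing a "transact" request whose
 * parameters were parsed from JSON.  The parameter string is hypothetical,
 * and passing a null session, role, and id is an assumption made for
 * brevity; json_from_string() comes from openvswitch/json.h.
 *
 *     struct json *params = json_from_string(
 *         "[\"mydb\", {\"op\": \"select\", \"table\": \"T\", \"where\": []}]");
 *     struct json *result = ovsdb_execute(db, NULL, params, false,
 *                                         NULL, NULL, 0, NULL);
 *     ...inspect and report 'result'...
 *     json_destroy(params);
 *     json_destroy(result);
 */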

/* Database compaction.  If 'trim_memory' is true, an attempt is made to
 * return unused heap memory to the system after a successful compaction
 * (glibc-specific; subsequent allocations may be slower).  For raft storage
 * the snapshot JSON is prepared in a separate thread; the functions below
 * wait for and check the state of that work. */
struct ovsdb_error *ovsdb_snapshot(struct ovsdb *, bool trim_memory)
    OVS_WARN_UNUSED_RESULT;
void ovsdb_snapshot_wait(struct ovsdb *);
bool ovsdb_snapshot_in_progress(struct ovsdb *);
bool ovsdb_snapshot_ready(struct ovsdb *);
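
/* Example (an illustrative sketch): driving a compaction from an OVS poll
 * loop until the background snapshot work is done.  The finalization step
 * that consumes the ready snapshot is omitted here; poll_block() comes from
 * openvswitch/poll-loop.h.
 *
 *     struct ovsdb_error *error = ovsdb_snapshot(db, false);
 *     if (error) {
 *         ...report and destroy 'error'...
 *     }
 *     while (ovsdb_snapshot_in_progress(db) && !ovsdb_snapshot_ready(db)) {
 *         ovsdb_snapshot_wait(db);
 *         poll_block();
 *     }
 */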
void ovsdb_replace(struct ovsdb *dst, struct ovsdb *src);

#endif /* ovsdb/ovsdb.h */