2017-12-31 21:15:58 -08:00
|
|
|
|
/*
|
|
|
|
|
* Copyright (c) 2017, 2018 Nicira, Inc.
|
|
|
|
|
*
|
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
|
* You may obtain a copy of the License at:
|
|
|
|
|
*
|
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
*
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
|
* limitations under the License.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#ifndef RAFT_PRIVATE_H
|
|
|
|
|
#define RAFT_PRIVATE_H 1
|
|
|
|
|
|
|
|
|
|
/* Data structures for use internally within the Raft implementation. */
|
|
|
|
|
|
|
|
|
|
#include "raft.h"
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
#include "openvswitch/hmap.h"
|
|
|
|
|
#include "openvswitch/uuid.h"
|
|
|
|
|
#include "sset.h"
|
|
|
|
|
|
|
|
|
|
/* Forward declarations of types that this header only refers to through
 * pointers. */
struct ds;
struct ovsdb_parser;
struct raft_install_snapshot_request;
|
2017-12-31 21:15:58 -08:00
|
|
|
|
|
|
|
|
|
/* Formatting server IDs and cluster IDs for use in human-readable logs.  Do
 * not use these in cases where the whole server or cluster ID is needed; use
 * UUID_FMT and UUID_ARGS in that case.
 *
 * Each *_FMT is a printf() conversion that prints four hexadecimal digits,
 * each *_ARGS(ID) expands to uuid_prefix(ID, 4) to supply its argument, and
 * each *_LEN is the number of characters that *_FMT produces for such a
 * four-digit value. */

#define SID_FMT "%04x"
#define SID_ARGS(SID) uuid_prefix(SID, 4)
#define SID_LEN 4

#define CID_FMT "%04x"
#define CID_ARGS(CID) uuid_prefix(CID, 4)
#define CID_LEN 4
|
|
|
|
|
|
|
|
|
|
struct ovsdb_error *raft_address_validate(const char *address)
|
|
|
|
|
OVS_WARN_UNUSED_RESULT;
|
|
|
|
|
struct ovsdb_error *raft_addresses_from_json(const struct json *,
|
|
|
|
|
struct sset *addresses)
|
|
|
|
|
OVS_WARN_UNUSED_RESULT;
|
|
|
|
|
struct json *raft_addresses_to_json(const struct sset *addresses);
|
|
|
|
|
|
|
|
|
|
char *raft_address_to_nickname(const char *address, const struct uuid *sid);
|
|
|
|
|
|
|
|
|
|
/* Phase of a server with respect to a change in cluster membership. */
enum raft_server_phase {
    RAFT_PHASE_STABLE,          /* Not being changed. */

    /* Phases for servers being added. */
    RAFT_PHASE_CATCHUP,         /* Populating new server's log. */
    RAFT_PHASE_CAUGHT_UP,       /* Waiting for prev configuration to commit. */
    RAFT_PHASE_COMMITTING,      /* Waiting for new configuration to commit. */

    /* Phases for servers to be removed. */
    RAFT_PHASE_REMOVE,          /* To be removed. */
};

/* Returns a string that names the given phase. */
const char *raft_server_phase_to_string(enum raft_server_phase);
|
|
|
|
|
|
|
|
|
|
/* Information about a server in a Raft cluster.
|
|
|
|
|
*
|
|
|
|
|
* Often within struct raft's 'servers' or 'add_servers' hmap. */
|
|
|
|
|
struct raft_server {
|
|
|
|
|
struct hmap_node hmap_node; /* Hashed based on 'sid'. */
|
|
|
|
|
|
|
|
|
|
struct uuid sid; /* Unique Server ID. */
|
|
|
|
|
char *address; /* "(tcp|ssl):1.2.3.4:5678" */
|
|
|
|
|
char *nickname; /* "1ab3(s3)" */
|
|
|
|
|
|
|
|
|
|
/* Volatile state on candidates. Reinitialized at start of election. */
|
|
|
|
|
struct uuid vote; /* Server ID of vote, or all-zeros. */
|
|
|
|
|
|
|
|
|
|
/* Volatile state on leaders. Reinitialized after election. */
|
|
|
|
|
uint64_t next_index; /* Index of next log entry to send this server. */
|
|
|
|
|
uint64_t match_index; /* Index of max log entry server known to have. */
|
|
|
|
|
enum raft_server_phase phase;
|
2019-08-19 09:29:58 -07:00
|
|
|
|
bool replied; /* Reply to append_request was received from this
|
|
|
|
|
node during current election_timeout interval.
|
|
|
|
|
*/
|
2020-10-20 18:22:25 +02:00
|
|
|
|
/* install_snapshot_request has been sent, but there is no response yet. */
|
|
|
|
|
bool install_snapshot_request_in_progress;
|
2020-05-22 22:36:27 +02:00
|
|
|
|
|
2017-12-31 21:15:58 -08:00
|
|
|
|
/* For use in adding and removing servers: */
|
|
|
|
|
struct uuid requester_sid; /* Nonzero if requested via RPC. */
|
|
|
|
|
struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */
|
2020-11-25 11:12:59 +01:00
|
|
|
|
|
|
|
|
|
long long int last_msg_ts; /* Last received msg timestamp in ms. */
|
2017-12-31 21:15:58 -08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
void raft_server_destroy(struct raft_server *);
|
|
|
|
|
void raft_servers_destroy(struct hmap *servers);
|
|
|
|
|
struct raft_server *raft_server_add(struct hmap *servers,
|
|
|
|
|
const struct uuid *sid,
|
|
|
|
|
const char *address);
|
|
|
|
|
struct raft_server *raft_server_find(const struct hmap *servers,
|
|
|
|
|
const struct uuid *sid);
|
|
|
|
|
const char *raft_servers_get_nickname__(const struct hmap *servers,
|
|
|
|
|
const struct uuid *sid);
|
|
|
|
|
const char *raft_servers_get_nickname(const struct hmap *servers,
|
|
|
|
|
const struct uuid *sid,
|
|
|
|
|
char buf[SID_LEN + 1], size_t bufsize);
|
|
|
|
|
struct ovsdb_error *raft_servers_from_json(const struct json *,
|
|
|
|
|
struct hmap *servers)
|
|
|
|
|
OVS_WARN_UNUSED_RESULT;
|
|
|
|
|
struct ovsdb_error *raft_servers_validate_json(const struct json *);
|
|
|
|
|
OVS_WARN_UNUSED_RESULT
|
|
|
|
|
struct json *raft_servers_to_json(const struct hmap *servers);
|
|
|
|
|
void raft_servers_format(const struct hmap *servers, struct ds *ds);
|
|
|
|
|
|
|
|
|
|
/* A raft_entry is an in-memory data structure that represents a Raft log
|
|
|
|
|
* entry. */
|
|
|
|
|
struct raft_entry {
|
|
|
|
|
uint64_t term;
|
raft: Don't keep full json objects in memory if no longer needed.
Raft log entries (and raft database snapshot) contains json objects
of the data. Follower receives append requests with data that gets
parsed and added to the raft log. Leader receives execution requests,
parses data out of them and adds to the log. In both cases, later
ovsdb-server reads the log with ovsdb_storage_read(), constructs
transaction and updates the database. On followers these json objects
in common case are never used again. Leader may use them to send
append requests or snapshot installation requests to followers.
However, all these operations (except for ovsdb_storage_read()) are
just serializing the json in order to send it over the network.
Json objects are significantly larger than their serialized string
representation. For example, the snapshot of the database from one of
the ovn-heater scale tests takes 270 MB as a string, but 1.6 GB as
a json object from the total 3.8 GB consumed by ovsdb-server process.
ovsdb_storage_read() for a given raft entry happens only once in a
lifetime, so after this call, we can serialize the json object, store
the string representation and free the actual json object that ovsdb
will never need again. This can save a lot of memory and can also
save serialization time, because each raft entry for append requests
and snapshot installation requests serialized only once instead of
doing that every time such request needs to be sent.
JSON_SERIALIZED_OBJECT can be used in order to seamlessly integrate
pre-serialized data into raft_header and similar json objects.
One major special case is creation of a database snapshot.
Snapshot installation request received over the network will be parsed
and read by ovsdb-server just like any other raft log entry. However,
snapshots created locally with raft_store_snapshot() will never be
read back, because they reflect the current state of the database,
hence already applied. For this case we can free the json object
right after writing snapshot on disk.
Tests performed with ovn-heater on 60 node density-light scenario,
where on-disk database goes up to 97 MB, shows average memory
consumption of ovsdb-server Southbound DB processes decreased by 58%
(from 602 MB to 256 MB per process) and peak memory consumption
decreased by 40% (from 1288 MB to 771 MB).
Test with 120 nodes on density-heavy scenario with 270 MB on-disk
database shows 1.5 GB memory consumption decrease as expected.
Also, total CPU time consumed by the Southbound DB process reduced
from 296 to 256 minutes. Number of unreasonably long poll intervals
reduced from 2896 down to 1934.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-24 21:00:38 +02:00
|
|
|
|
struct {
|
|
|
|
|
struct json *full_json; /* Fully parsed JSON object. */
|
|
|
|
|
struct json *serialized; /* JSON_SERIALIZED_OBJECT version of data. */
|
|
|
|
|
} data;
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct uuid eid;
|
|
|
|
|
struct json *servers;
|
2019-08-19 09:30:00 -07:00
|
|
|
|
uint64_t election_timer;
|
2017-12-31 21:15:58 -08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
void raft_entry_clone(struct raft_entry *, const struct raft_entry *);
|
|
|
|
|
void raft_entry_uninit(struct raft_entry *);
|
|
|
|
|
struct json *raft_entry_to_json(const struct raft_entry *);
|
2025-06-24 21:54:33 +02:00
|
|
|
|
struct ovsdb_error *raft_entry_from_json(const struct json *,
|
|
|
|
|
struct raft_entry *)
|
2017-12-31 21:15:58 -08:00
|
|
|
|
OVS_WARN_UNUSED_RESULT;
|
|
|
|
|
bool raft_entry_equals(const struct raft_entry *, const struct raft_entry *);
|
raft: Don't keep full json objects in memory if no longer needed.
Raft log entries (and raft database snapshot) contains json objects
of the data. Follower receives append requests with data that gets
parsed and added to the raft log. Leader receives execution requests,
parses data out of them and adds to the log. In both cases, later
ovsdb-server reads the log with ovsdb_storage_read(), constructs
transaction and updates the database. On followers these json objects
in common case are never used again. Leader may use them to send
append requests or snapshot installation requests to followers.
However, all these operations (except for ovsdb_storage_read()) are
just serializing the json in order to send it over the network.
Json objects are significantly larger than their serialized string
representation. For example, the snapshot of the database from one of
the ovn-heater scale tests takes 270 MB as a string, but 1.6 GB as
a json object from the total 3.8 GB consumed by ovsdb-server process.
ovsdb_storage_read() for a given raft entry happens only once in a
lifetime, so after this call, we can serialize the json object, store
the string representation and free the actual json object that ovsdb
will never need again. This can save a lot of memory and can also
save serialization time, because each raft entry for append requests
and snapshot installation requests serialized only once instead of
doing that every time such request needs to be sent.
JSON_SERIALIZED_OBJECT can be used in order to seamlessly integrate
pre-serialized data into raft_header and similar json objects.
One major special case is creation of a database snapshot.
Snapshot installation request received over the network will be parsed
and read by ovsdb-server just like any other raft log entry. However,
snapshots created locally with raft_store_snapshot() will never be
read back, because they reflect the current state of the database,
hence already applied. For this case we can free the json object
right after writing snapshot on disk.
Tests performed with ovn-heater on 60 node density-light scenario,
where on-disk database goes up to 97 MB, shows average memory
consumption of ovsdb-server Southbound DB processes decreased by 58%
(from 602 MB to 256 MB per process) and peak memory consumption
decreased by 40% (from 1288 MB to 771 MB).
Test with 120 nodes on density-heavy scenario with 270 MB on-disk
database shows 1.5 GB memory consumption decrease as expected.
Also, total CPU time consumed by the Southbound DB process reduced
from 296 to 256 minutes. Number of unreasonably long poll intervals
reduced from 2896 down to 1934.
Acked-by: Dumitru Ceara <dceara@redhat.com>
Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-24 21:00:38 +02:00
|
|
|
|
bool raft_entry_has_data(const struct raft_entry *);
|
|
|
|
|
void raft_entry_set_parsed_data(struct raft_entry *, const struct json *);
|
|
|
|
|
void raft_entry_set_parsed_data_nocopy(struct raft_entry *, struct json *);
|
|
|
|
|
struct json *raft_entry_steal_parsed_data(struct raft_entry *)
|
|
|
|
|
OVS_WARN_UNUSED_RESULT;
|
|
|
|
|
const struct json *raft_entry_get_parsed_data(const struct raft_entry *);
|
|
|
|
|
const struct json *raft_entry_get_serialized_data(const struct raft_entry *);
|
2017-12-31 21:15:58 -08:00
|
|
|
|
|
|
|
|
|
/* On disk data serialization and deserialization. */
|
|
|
|
|
|
|
|
|
|
/* First record in a Raft log. */
|
|
|
|
|
struct raft_header {
|
|
|
|
|
/* All servers. */
|
|
|
|
|
struct uuid sid; /* Server ID. */
|
|
|
|
|
struct uuid cid; /* Cluster ID. May be zero if 'joining'. */
|
|
|
|
|
char *name; /* Database name. */
|
|
|
|
|
char *local_address; /* Address for Raft server to listen. */
|
|
|
|
|
bool joining; /* True iff cluster not joined yet. */
|
|
|
|
|
|
|
|
|
|
/* Only for servers that haven't joined the cluster yet. */
|
|
|
|
|
struct sset remote_addresses; /* Address of other Raft servers. */
|
|
|
|
|
|
|
|
|
|
/* Only for servers that have joined the cluster. */
|
|
|
|
|
uint64_t snap_index; /* Snapshot's index. */
|
|
|
|
|
struct raft_entry snap; /* Snapshot. */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
void raft_header_uninit(struct raft_header *);
|
|
|
|
|
struct ovsdb_error *raft_header_from_json(struct raft_header *,
|
|
|
|
|
const struct json *)
|
|
|
|
|
OVS_WARN_UNUSED_RESULT;
|
|
|
|
|
struct json *raft_header_to_json(const struct raft_header *);
|
|
|
|
|
|
|
|
|
|
/* Kinds of record that can appear in a Raft log. */
enum raft_record_type {
    /* Record types that match those in the Raft specification. */
    RAFT_REC_ENTRY,             /* A log entry. */
    RAFT_REC_TERM,              /* A new term. */
    RAFT_REC_VOTE,              /* A vote. */

    /* Extensions. */
    RAFT_REC_NOTE,              /* A note about some significant event. */
    RAFT_REC_COMMIT_INDEX,      /* An update to the local commit_index. */
    RAFT_REC_LEADER,            /* A server has become leader for this term. */
};
|
|
|
|
|
|
|
|
|
|
/* Type used for the second and subsequent records in a Raft log. */
|
|
|
|
|
struct raft_record {
|
|
|
|
|
enum raft_record_type type;
|
|
|
|
|
char *comment;
|
|
|
|
|
|
|
|
|
|
/* Valid in RAFT_REC_ENTRY, RAFT_REC_TERM, RAFT_REC_LEADER, and
|
|
|
|
|
* RAFT_REC_VOTE, and otherwise 0. */
|
|
|
|
|
uint64_t term;
|
|
|
|
|
|
|
|
|
|
union {
|
|
|
|
|
char *note; /* RAFT_REC_NOTE. */
|
|
|
|
|
|
|
|
|
|
uint64_t commit_index; /* RAFT_REC_COMMIT_INDEX. */
|
|
|
|
|
|
|
|
|
|
struct uuid sid; /* RAFT_REC_VOTE, RAFT_REC_LEADER. */
|
|
|
|
|
|
|
|
|
|
struct { /* RAFT_REC_ENTRY. */
|
|
|
|
|
uint64_t index;
|
|
|
|
|
struct json *data;
|
|
|
|
|
struct json *servers;
|
2019-08-19 09:30:00 -07:00
|
|
|
|
uint64_t election_timer;
|
2017-12-31 21:15:58 -08:00
|
|
|
|
struct uuid eid;
|
|
|
|
|
} entry;
|
|
|
|
|
};
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
void raft_record_uninit(struct raft_record *);
|
|
|
|
|
struct ovsdb_error *raft_record_from_json(struct raft_record *,
|
|
|
|
|
const struct json *)
|
|
|
|
|
OVS_WARN_UNUSED_RESULT;
|
|
|
|
|
struct json *raft_record_to_json(const struct raft_record *);
|
|
|
|
|
|
|
|
|
|
/* Helpers for storing a uint64_t member named 'name' in a JSON object and
 * for reading typed members named 'name' through an ovsdb_parser. */
void raft_put_uint64(struct json *object, const char *name, uint64_t integer);
uint64_t raft_parse_optional_uint64(struct ovsdb_parser *, const char *name);
uint64_t raft_parse_required_uint64(struct ovsdb_parser *, const char *name);

bool raft_parse_required_boolean(struct ovsdb_parser *, const char *name);
/* NOTE(review): returns 'int' rather than 'bool', presumably so that an
 * absent member can be told apart from true and false -- confirm against the
 * definition. */
int raft_parse_optional_boolean(struct ovsdb_parser *, const char *name);
const char *raft_parse_required_string(struct ovsdb_parser *,
                                       const char *name);
const char *raft_parse_optional_string(struct ovsdb_parser *,
                                       const char *name);
bool raft_parse_uuid(struct ovsdb_parser *, const char *name, bool optional,
                     struct uuid *);
struct uuid raft_parse_required_uuid(struct ovsdb_parser *, const char *name);
bool raft_parse_optional_uuid(struct ovsdb_parser *, const char *name,
                              struct uuid *);
|
|
|
|
|
|
|
|
|
|
#endif /* raft-private.h */
|