2
0
mirror of https://gitlab.isc.org/isc-projects/bind9 synced 2025-08-23 18:49:54 +00:00
bind/lib/dns/rbt.c

3760 lines
92 KiB
C
Raw Normal View History

/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
/*! \file */
1999-03-18 21:21:31 +00:00
#include <inttypes.h>
#include <stdbool.h>
#include <sys/stat.h>
#include <isc/crc64.h>
#include <isc/file.h>
#include <isc/hex.h>
#include <isc/mem.h>
#include <isc/once.h>
#include <isc/platform.h>
2001-03-09 23:38:00 +00:00
#include <isc/print.h>
#include <isc/refcount.h>
#include <isc/socket.h>
#include <isc/stdio.h>
#include <isc/string.h>
2000-04-28 01:08:52 +00:00
#include <isc/util.h>
/*%
* This define is so dns/name.h (included by dns/fixedname.h) uses more
* efficient macro calls instead of functions for a few operations.
*/
#define DNS_NAME_USEINLINE 1
#include <unistd.h>
#include <dns/fixedname.h>
#include <dns/log.h>
#include <dns/rbt.h>
#include <dns/result.h>
/*
 * Evaluate 'x', store its value in the variable 'result', and jump to the
 * 'cleanup' label unless that value is ISC_R_SUCCESS.  The function using
 * this macro must provide both a 'result' variable and a 'cleanup' label.
 */
#define CHECK(x) \
do { \
result = (x); \
if (result != ISC_R_SUCCESS) \
goto cleanup; \
} while (0)
2020-02-13 14:44:37 -08:00
/* Magic value for an rbt object, and the check that tests 'rbt' against it. */
#define RBT_MAGIC ISC_MAGIC('R', 'B', 'T', '+')
#define VALID_RBT(rbt) ISC_MAGIC_VALID(rbt, RBT_MAGIC)
/*
* XXXDCL Since parent pointers were added in again, I could remove all of the
* chain junk, and replace with dns_rbt_firstnode, _previousnode, _nextnode,
* _lastnode. This would involve pretty major change to the API.
*/
2020-02-13 14:44:37 -08:00
/* Magic value for a chain object, and the check that tests 'chain' against it. */
#define CHAIN_MAGIC ISC_MAGIC('0', '-', '0', '-')
#define VALID_CHAIN(chain) ISC_MAGIC_VALID(chain, CHAIN_MAGIC)
1999-04-09 15:21:15 +00:00
#define RBT_HASH_MIN_BITS 4
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
/*
 * Hash table sizing parameters.  RBT_HASH_MAX_BITS is the upper bound that
 * hash_32() enforces on its 'bits' argument.
 */
#define RBT_HASH_MAX_BITS 32
#define RBT_HASH_OVERCOMMIT 3
#define RBT_HASH_BUCKETSIZE 4096 /* FIXME: What would be a good value here? */
/*
 * NOTE(review): RBT_HASH_SIZE is not referenced anywhere else in the part
 * of the file visible here -- confirm whether this test-only override still
 * has any effect.
 */
#ifdef RBT_MEM_TEST
#undef RBT_HASH_SIZE
#define RBT_HASH_SIZE 2 /*%< To give the reallocation code a workout. */
#endif /* ifdef RBT_MEM_TEST */
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
/* Multiplier used by hash_32() to spread a 32-bit hash value. */
#define GOLDEN_RATIO_32 0x61C88647
/* 2^bits, computed in 64 bits so that bits == 32 does not overflow. */
#define HASHSIZE(bits) (UINT64_C(1) << (bits))
/*
 * Reduce the 32-bit hash value 'val' to a table index that is 'bits' bits
 * wide, i.e. a value in the range [0, 2^bits), by multiplying with
 * GOLDEN_RATIO_32 and keeping the top 'bits' bits of the product.
 *
 * 'bits' must not exceed RBT_HASH_MAX_BITS.  A width of zero describes a
 * single-slot table whose only index is 0; it is handled explicitly
 * because shifting a 32-bit value by 32 positions is undefined behavior.
 */
static inline uint32_t
hash_32(uint32_t val, unsigned int bits) {
	REQUIRE(bits <= RBT_HASH_MAX_BITS);

	if (bits == 0) {
		return (0);
	}

	/* High bits are more random. */
	return (val * GOLDEN_RATIO_32 >> (32 - bits));
}
struct dns_rbt {
2020-02-13 14:44:37 -08:00
unsigned int magic;
isc_mem_t *mctx;
dns_rbtnode_t *root;
void (*data_deleter)(void *, void *);
2020-02-13 14:44:37 -08:00
void *deleter_arg;
unsigned int nodecount;
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
uint16_t hashbits;
uint16_t maxhashbits;
dns_rbtnode_t **hashtable;
2020-02-13 14:44:37 -08:00
void *mmap_location;
};
2020-02-13 14:44:37 -08:00
#define RED 0
#define BLACK 1
/*
* This is the header for map-format RBT images. It is populated,
* and then written, as the LAST thing done to the file before returning.
* Writing this last (with zeros in the header area initially) will ensure
* that the header is only valid when the RBT image is also valid.
*/
/* Short name for the header structure defined below. */
typedef struct file_header file_header_t;
/*
 * Version string buffer, padded to 32 bytes; the same size as the
 * version1/version2 members of struct file_header.
 */
static char FILE_VERSION[32] = "\0";
/* Header length, always the same size regardless of structure size */
#define HEADER_LENGTH 1024
/*
 * Header stored at the start of a map-format image.  Member order and
 * types are unchanged, so the layout is the same as before.
 */
struct file_header {
	char version1[32];
	uint64_t first_node_offset; /* usually 1024 */

	/*
	 * Information about the system on which the map file was
	 * generated; used to tell whether the map file can be loaded.
	 */
	uint32_t ptrsize;
	unsigned int bigendian : 1;	 /* big or little endian system */
	unsigned int rdataset_fixed : 1; /* compiled with
					  * --enable-rrset-fixed
					  */
	unsigned int nodecount;		 /* shadow from rbt structure */

	uint64_t crc;
	char version2[32]; /* repeated; must match version1 */
};
/*
* The following declarations are for the serialization of an RBT:
*
* step one: write out a zeroed header of 1024 bytes
* step two: walk the tree in a depth-first, left-right-down order, writing
* out the nodes, reserving space as we go, correcting addresses to point
2012-06-20 23:46:40 +00:00
* at the proper offset in the file, and setting a flag for each pointer to
* indicate that it is a reference to a location in the file, rather than in
* memory.
2012-06-20 23:46:40 +00:00
* step three: write out the header, adding the information that will be
* needed to re-create the tree object itself.
*
* The RBTDB object will do this three times, once for each of the three
* RBT objects it contains.
*
* Note: 'file' must point to an actual open file that can be mmapped
* and fseeked, not to a pipe or stream
*/
2020-02-14 08:14:03 +01:00
static isc_result_t
dns_rbt_zero_header(FILE *file);
2020-02-14 08:14:03 +01:00
static isc_result_t
write_header(FILE *file, dns_rbt_t *rbt, uint64_t first_node_offset,
uint64_t crc);
2020-02-14 08:14:03 +01:00
static bool
match_header_version(file_header_t *header);
2020-02-14 08:14:03 +01:00
static isc_result_t
serialize_node(FILE *file, dns_rbtnode_t *node, uintptr_t left, uintptr_t right,
uintptr_t down, uintptr_t parent, uintptr_t data, uint64_t *crc);
2020-02-14 08:14:03 +01:00
static isc_result_t
serialize_nodes(FILE *file, dns_rbtnode_t *node, uintptr_t parent,
dns_rbtdatawriter_t datawriter, void *writer_arg,
uintptr_t *where, uint64_t *crc);
/*
2012-06-20 23:46:40 +00:00
* The following functions allow you to get the actual address of a pointer
* without having to use an if statement to check to see if that address is
* relative or not
*/
static inline dns_rbtnode_t *
2020-02-13 14:44:37 -08:00
getparent(dns_rbtnode_t *node, file_header_t *header) {
char *adjusted_address = (char *)(node->parent);
adjusted_address += node->parent_is_relative * (uintptr_t)header;
2012-06-20 23:46:40 +00:00
return ((dns_rbtnode_t *)adjusted_address);
}
static inline dns_rbtnode_t *
2020-02-13 14:44:37 -08:00
getleft(dns_rbtnode_t *node, file_header_t *header) {
char *adjusted_address = (char *)(node->left);
adjusted_address += node->left_is_relative * (uintptr_t)header;
2012-06-20 23:46:40 +00:00
return ((dns_rbtnode_t *)adjusted_address);
}
static inline dns_rbtnode_t *
2020-02-13 14:44:37 -08:00
getright(dns_rbtnode_t *node, file_header_t *header) {
char *adjusted_address = (char *)(node->right);
adjusted_address += node->right_is_relative * (uintptr_t)header;
2012-06-20 23:46:40 +00:00
return ((dns_rbtnode_t *)adjusted_address);
}
static inline dns_rbtnode_t *
2020-02-13 14:44:37 -08:00
getdown(dns_rbtnode_t *node, file_header_t *header) {
char *adjusted_address = (char *)(node->down);
adjusted_address += node->down_is_relative * (uintptr_t)header;
2012-06-20 23:46:40 +00:00
return ((dns_rbtnode_t *)adjusted_address);
}
static inline dns_rbtnode_t *
2020-02-13 14:44:37 -08:00
getdata(dns_rbtnode_t *node, file_header_t *header) {
char *adjusted_address = (char *)(node->data);
adjusted_address += node->data_is_relative * (uintptr_t)header;
2012-06-20 23:46:40 +00:00
return ((dns_rbtnode_t *)adjusted_address);
}
/*%
* Elements of the rbtnode structure.
*/
2020-02-13 14:44:37 -08:00
/* Each macro expands to the named member of 'node' and is usable as an lvalue. */
#define PARENT(node) ((node)->parent)
#define LEFT(node) ((node)->left)
#define RIGHT(node) ((node)->right)
#define DOWN(node) ((node)->down)
#define UPPERNODE(node) ((node)->uppernode)
#define DATA(node) ((node)->data)
/* True when the node's data pointer is NULL. */
#define IS_EMPTY(node) ((node)->data == NULL)
#define HASHNEXT(node) ((node)->hashnext)
#define HASHVAL(node) ((node)->hashval)
#define COLOR(node) ((node)->color)
#define NAMELEN(node) ((node)->namelen)
#define OLDNAMELEN(node) ((node)->oldnamelen)
#define OFFSETLEN(node) ((node)->offsetlen)
#define ATTRS(node) ((node)->attributes)
#define IS_ROOT(node) ((node)->is_root)
#define FINDCALLBACK(node) ((node)->find_callback)
Only test node->data if we care about whether data is present or not. WARNING: ThreadSanitizer: data race (pid=28788) Write of size 8 at 0x7b200002e060 by thread T1 (mutexes: write M2947): #0 add32 /builds/isc-projects/bind9/lib/dns/rbtdb.c:6638:18 (libdns.so.1110+0xe7843) #1 addrdataset /builds/isc-projects/bind9/lib/dns/rbtdb.c:6975:12 (libdns.so.1110+0xe4185) #2 dns_db_addrdataset /builds/isc-projects/bind9/lib/dns/db.c:783:10 (libdns.so.1110+0x650ee) #3 validated /builds/isc-projects/bind9/lib/dns/resolver.c:5140:11 (libdns.so.1110+0x1909f7) #4 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #5 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749) Previous read of size 8 at 0x7b200002e060 by thread T5 (mutexes: write M521146194917735760): #0 dns_rbt_findnode /builds/isc-projects/bind9/lib/dns/rbt.c:1708:9 (libdns.so.1110+0xd910d) #1 cache_find /builds/isc-projects/bind9/lib/dns/rbtdb.c:5098:11 (libdns.so.1110+0xe188e) #2 dns_db_find /builds/isc-projects/bind9/lib/dns/db.c:554:11 (libdns.so.1110+0x642bb) #3 dns_view_find2 /builds/isc-projects/bind9/lib/dns/view.c:1068:11 (libdns.so.1110+0x1cc2c4) #4 dbfind_name /builds/isc-projects/bind9/lib/dns/adb.c:3714:11 (libdns.so.1110+0x46a4b) #5 dns_adb_createfind2 /builds/isc-projects/bind9/lib/dns/adb.c:3133:12 (libdns.so.1110+0x45278) #6 findname /builds/isc-projects/bind9/lib/dns/resolver.c:3166:11 (libdns.so.1110+0x1827f0) #7 fctx_getaddresses /builds/isc-projects/bind9/lib/dns/resolver.c:3462:3 (libdns.so.1110+0x18032d) #8 fctx_try /builds/isc-projects/bind9/lib/dns/resolver.c:3819:12 (libdns.so.1110+0x17e174) #9 fctx_start /builds/isc-projects/bind9/lib/dns/resolver.c:4219:4 (libdns.so.1110+0x1787a3) #10 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #11 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749)
2020-08-26 16:24:13 +10:00
/*
 * True when 'options' has DNS_RBTFIND_EMPTYDATA set, or when 'node' has a
 * non-NULL data pointer.  The data pointer is only read when the option is
 * not set.  'options' is parenthesized so that a compound expression
 * (e.g. "a | b") is tested as a whole.
 */
#define WANTEMPTYDATA_OR_DATA(options, node) \
	(((options) & DNS_RBTFIND_EMPTYDATA) != 0 || DATA(node) != NULL)
/*%
* Structure elements from the rbtdb.c, not
* used as part of the rbt.c algorithms.
*/
2020-02-13 14:44:37 -08:00
/* Plain member accessors; each expands to the named member of 'node'. */
#define DIRTY(node) ((node)->dirty)
#define WILD(node) ((node)->wild)
#define LOCKNUM(node) ((node)->locknum)
/*%
* The variable length stuff stored after the node has the following
* structure.
*
* &lt;name_data&gt;{1..255}&lt;oldoffsetlen&gt;{1}&lt;offsets&gt;{1..128}
*
* &lt;name_data&gt; contains the name of the node when it was created.
* &lt;oldoffsetlen&gt; contains the length of &lt;offsets&gt; when the node
* was created.
* &lt;offsets&gt; contains the offsets into name for each label when the node
* was created.
*/
2020-02-13 14:44:37 -08:00
/*
 * Accessors for the variable-length area that follows a node:
 *
 *	NAME(node)	   first byte after the node structure
 *	OFFSETS(node)	   NAME + OLDNAMELEN + 1, i.e. just past the
 *			   one-byte oldoffsetlen field
 *	OLDOFFSETLEN(node) the byte immediately before OFFSETS
 *	NODE_SIZE(node)	   size of the node structure plus the name data,
 *			   the oldoffsetlen byte and the offsets
 *
 * Every use of the argument is parenthesized so that an expression such as
 * "p + 1" can be passed safely.
 */
#define NAME(node)	   ((unsigned char *)((node) + 1))
#define OFFSETS(node)	   (NAME(node) + OLDNAMELEN(node) + 1)
#define OLDOFFSETLEN(node) (OFFSETS(node)[-1])
#define NODE_SIZE(node) \
	(sizeof(*(node)) + OLDNAMELEN(node) + OLDOFFSETLEN(node) + 1)
/*%
* Color management.
*/
2020-02-13 14:44:37 -08:00
/*
 * A NULL node counts as black: IS_RED() is false and IS_BLACK() is true for
 * NULL.  MAKE_RED() and MAKE_BLACK() dereference 'node' unconditionally.
 */
#define IS_RED(node) ((node) != NULL && (node)->color == RED)
#define IS_BLACK(node) ((node) == NULL || (node)->color == BLACK)
#define MAKE_RED(node) ((node)->color = RED)
#define MAKE_BLACK(node) ((node)->color = BLACK)
/*%
1999-04-09 15:21:15 +00:00
* Chain management.
*
* The "ancestors" member of chains was removed, with its job now
2009-01-17 14:45:17 +00:00
* being wholly handled by parent pointers (which didn't exist, because
* of memory concerns, when chains were first implemented).
1999-04-09 15:21:15 +00:00
*/
/*
 * Append 'node' to chain->levels and increment chain->level_count.
 * INSIST()s beforehand that level_count is below DNS_RBT_LEVELBLOCK.
 */
#define ADD_LEVEL(chain, node) \
do { \
INSIST((chain)->level_count < DNS_RBT_LEVELBLOCK); \
(chain)->levels[(chain)->level_count++] = (node); \
} while (0)
1999-02-06 01:27:35 +00:00
/*%
1999-02-06 01:27:35 +00:00
* The following macros directly access normally private name variables.
* These macros are used to avoid a lot of function calls in the critical
* path of the tree traversal code.
*/
static inline void
2020-02-13 14:44:37 -08:00
NODENAME(dns_rbtnode_t *node, dns_name_t *name) {
name->length = NAMELEN(node);
name->labels = OFFSETLEN(node);
name->ndata = NAME(node);
name->offsets = OFFSETS(node);
name->attributes = ATTRS(node);
name->attributes |= DNS_NAMEATTR_READONLY;
}
1999-02-06 01:27:35 +00:00
#ifdef DEBUG
#define inline
/*
* A little something to help out in GDB.
*/
2020-02-14 08:14:03 +01:00
dns_name_t
Name(dns_rbtnode_t *node);

/*
 * Debugging aid: return, by value, a dns_name_t describing the name stored
 * in 'node'.  For a NULL 'node' the result is a freshly initialized name.
 */
dns_name_t
Name(dns_rbtnode_t *node) {
	dns_name_t result;

	dns_name_init(&result, NULL);
	if (node == NULL) {
		return (result);
	}

	NODENAME(node, &result);
	return (result);
}
static void
2020-02-13 14:44:37 -08:00
hexdump(const char *desc, unsigned char *data, size_t size) {
char hexdump[BUFSIZ * 2 + 1];
isc_buffer_t b;
isc_region_t r;
isc_result_t result;
2020-02-13 14:44:37 -08:00
size_t bytes;
fprintf(stderr, "%s: ", desc);
do {
isc_buffer_init(&b, hexdump, sizeof(hexdump));
r.base = data;
r.length = bytes = (size > BUFSIZ) ? BUFSIZ : size;
result = isc_hex_totext(&r, 0, "", &b);
RUNTIME_CHECK(result == ISC_R_SUCCESS);
isc_buffer_putuint8(&b, 0);
fprintf(stderr, "%s", hexdump);
data += bytes;
size -= bytes;
} while (size > 0);
fprintf(stderr, "\n");
}
#endif /* DEBUG */
2015-12-09 08:54:04 -08:00
/*
* Upper node is the parent of the root of the passed node's
2015-12-09 19:07:20 +05:30
* subtree. The passed node must not be NULL.
*/
/*
 * Return the 'uppernode' member of 'node'.  'node' is dereferenced without
 * a NULL check.
 */
static inline dns_rbtnode_t *
get_upper_node(dns_rbtnode_t *node) {
	dns_rbtnode_t *upper = UPPERNODE(node);

	return (upper);
}
static void
2020-02-13 14:44:37 -08:00
fixup_uppernodes_helper(dns_rbtnode_t *node, dns_rbtnode_t *uppernode) {
if (node == NULL) {
2015-12-09 19:07:20 +05:30
return;
}
2015-12-09 19:07:20 +05:30
UPPERNODE(node) = uppernode;
fixup_uppernodes_helper(LEFT(node), uppernode);
fixup_uppernodes_helper(RIGHT(node), uppernode);
fixup_uppernodes_helper(DOWN(node), node);
}
/*
* This function is used to fixup uppernode members of all dns_rbtnodes
* after deserialization.
*/
static void
2020-02-13 14:44:37 -08:00
fixup_uppernodes(dns_rbt_t *rbt) {
2015-12-09 19:07:20 +05:30
fixup_uppernodes_helper(rbt->root, NULL);
}
size_t
2020-02-13 14:44:37 -08:00
dns__rbtnode_getdistance(dns_rbtnode_t *node) {
size_t nodes = 1;
while (node != NULL) {
if (IS_ROOT(node)) {
break;
}
nodes++;
node = PARENT(node);
}
return (nodes);
}
/*
* Forward declarations.
*/
2020-02-14 08:14:03 +01:00
static isc_result_t
create_node(isc_mem_t *mctx, const dns_name_t *name, dns_rbtnode_t **nodep);
2020-02-14 08:14:03 +01:00
static isc_result_t
inithash(dns_rbt_t *rbt);
2020-02-14 08:14:03 +01:00
static inline void
hash_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name);
2020-02-14 08:14:03 +01:00
static inline void
unhash_node(dns_rbt_t *rbt, dns_rbtnode_t *node);
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
static uint32_t
rehash_bits(dns_rbt_t *rbt, size_t newcount);
2020-02-14 08:14:03 +01:00
static void
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
rehash(dns_rbt_t *rbt, uint32_t newbits);
static void
maybe_rehash(dns_rbt_t *rbt, size_t size);
2020-02-14 08:14:03 +01:00
static inline void
rotate_left(dns_rbtnode_t *node, dns_rbtnode_t **rootp);
static inline void
rotate_right(dns_rbtnode_t *node, dns_rbtnode_t **rootp);
2020-02-14 08:14:03 +01:00
static void
addonlevel(dns_rbtnode_t *node, dns_rbtnode_t *current, int order,
dns_rbtnode_t **rootp);
2020-02-14 08:14:03 +01:00
static void
deletefromlevel(dns_rbtnode_t *item, dns_rbtnode_t **rootp);
2020-02-14 08:14:03 +01:00
static isc_result_t
treefix(dns_rbt_t *rbt, void *base, size_t size, dns_rbtnode_t *n,
const dns_name_t *name, dns_rbtdatafixer_t datafixer, void *fixer_arg,
uint64_t *crc);
2020-02-14 08:14:03 +01:00
static void
deletetreeflat(dns_rbt_t *rbt, unsigned int quantum, bool unhash,
dns_rbtnode_t **nodep);
2020-02-14 08:14:03 +01:00
static void
printnodename(dns_rbtnode_t *node, bool quoted, FILE *f);
2020-02-14 08:14:03 +01:00
static void
freenode(dns_rbt_t *rbt, dns_rbtnode_t **nodep);
2012-08-24 10:44:26 +10:00
static isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbt_zero_header(FILE *file) {
/*
* Write out a zeroed header as a placeholder. Doing this ensures
* that the file will not read while it is partially written, should
* writing fail or be interrupted.
*/
2020-02-13 14:44:37 -08:00
char buffer[HEADER_LENGTH];
isc_result_t result;
2012-06-20 23:46:40 +00:00
memset(buffer, 0, HEADER_LENGTH);
result = isc_stdio_write(buffer, 1, HEADER_LENGTH, file, NULL);
if (result != ISC_R_SUCCESS) {
return (result);
}
result = fflush(file);
if (result != ISC_R_SUCCESS) {
return (result);
}
2012-06-20 23:46:40 +00:00
return (ISC_R_SUCCESS);
}
static isc_once_t once = ISC_ONCE_INIT;
static void
2020-02-13 14:44:37 -08:00
init_file_version(void) {
int n;
memset(FILE_VERSION, 0, sizeof(FILE_VERSION));
n = snprintf(FILE_VERSION, sizeof(FILE_VERSION), "RBT Image %s %s",
Complete rewrite the BIND 9 build system The rewrite of BIND 9 build system is a large work and cannot be reasonable split into separate merge requests. Addition of the automake has a positive effect on the readability and maintainability of the build system as it is more declarative, it allows conditional and we are able to drop all of the custom make code that BIND 9 developed over the years to overcome the deficiencies of autoconf + custom Makefile.in files. This squashed commit contains following changes: - conversion (or rather fresh rewrite) of all Makefile.in files to Makefile.am by using automake - the libtool is now properly integrated with automake (the way we used it was rather hackish as the only official way how to use libtool is via automake - the dynamic module loading was rewritten from a custom patchwork to libtool's libltdl (which includes the patchwork to support module loading on different systems internally) - conversion of the unit test executor from kyua to automake parallel driver - conversion of the system test executor from custom make/shell to automake parallel driver - The GSSAPI has been refactored, the custom SPNEGO on the basis that all major KRB5/GSSAPI (mit-krb5, heimdal and Windows) implementations support SPNEGO mechanism. 
- The various defunct tests from bin/tests have been removed: bin/tests/optional and bin/tests/pkcs11 - The text files generated from the MD files have been removed, the MarkDown has been designed to be readable by both humans and computers - The xsl header is now generated by a simple sed command instead of perl helper - The <irs/platform.h> header has been removed - cleanups of configure.ac script to make it more simpler, addition of multiple macros (there's still work to be done though) - the tarball can now be prepared with `make dist` - the system tests are partially able to run in oot build Here's a list of unfinished work that needs to be completed in subsequent merge requests: - `make distcheck` doesn't yet work (because of system tests oot run is not yet finished) - documentation is not yet built, there's a different merge request with docbook to sphinx-build rst conversion that needs to be rebased and adapted on top of the automake - msvc build is non functional yet and we need to decide whether we will just cross-compile bind9 using mingw-w64 or fix the msvc build - contributed dlz modules are not included neither in the autoconf nor automake
2018-08-07 16:46:53 +02:00
PACKAGE_VERSION_MAJOR, MAPAPI);
INSIST(n > 0 && (unsigned int)n < sizeof(FILE_VERSION));
}
/*
* Write out the real header, including NodeDump version information
* and the offset of the first node.
*
* Any information stored in the rbt object itself should be stored
* here.
*/
2012-06-20 23:46:40 +00:00
static isc_result_t
write_header(FILE *file, dns_rbt_t *rbt, uint64_t first_node_offset,
2020-02-13 14:44:37 -08:00
uint64_t crc) {
file_header_t header;
2020-02-13 14:44:37 -08:00
isc_result_t result;
off_t location;
2012-06-20 23:46:40 +00:00
RUNTIME_CHECK(isc_once_do(&once, init_file_version) == ISC_R_SUCCESS);
memset(&header, 0, sizeof(file_header_t));
memmove(header.version1, FILE_VERSION, sizeof(header.version1));
memmove(header.version2, FILE_VERSION, sizeof(header.version2));
header.first_node_offset = first_node_offset;
header.ptrsize = (uint32_t)sizeof(void *);
header.bigendian = (1 == htonl(1)) ? 1 : 0;
2012-06-20 23:46:40 +00:00
#ifdef DNS_RDATASET_FIXED
header.rdataset_fixed = 1;
#else /* ifdef DNS_RDATASET_FIXED */
header.rdataset_fixed = 0;
#endif /* ifdef DNS_RDATASET_FIXED */
2012-06-20 23:46:40 +00:00
header.nodecount = rbt->nodecount;
header.crc = crc;
CHECK(isc_stdio_tell(file, &location));
location = dns_rbt_serialize_align(location);
CHECK(isc_stdio_seek(file, location, SEEK_SET));
CHECK(isc_stdio_write(&header, 1, sizeof(file_header_t), file, NULL));
CHECK(fflush(file));
2012-06-20 23:46:40 +00:00
/* Ensure we are always at the end of the file. */
CHECK(isc_stdio_seek(file, 0, SEEK_END));
cleanup:
return (result);
}
static bool
2020-02-13 14:44:37 -08:00
match_header_version(file_header_t *header) {
RUNTIME_CHECK(isc_once_do(&once, init_file_version) == ISC_R_SUCCESS);
if (memcmp(header->version1, FILE_VERSION, sizeof(header->version1)) !=
0 ||
memcmp(header->version2, FILE_VERSION, sizeof(header->version1)) !=
2020-02-13 14:44:37 -08:00
0)
{
return (false);
}
return (true);
}
/*
 * Return the sum of the name lengths stored in 'node' and in each node
 * reached by repeatedly following get_upper_node(), stopping after the
 * first absolute name.  If the chain of upper nodes ends before an
 * absolute name is seen, one more byte is added to the total.
 */
unsigned int
dns__rbtnode_namelen(dns_rbtnode_t *node) {
	dns_name_t label;
	unsigned int total = 0;

	REQUIRE(DNS_RBTNODE_VALID(node));

	dns_name_init(&label, NULL);

	for (;;) {
		if (node == NULL) {
			/* Ran out of upper nodes: add one byte and stop. */
			total += 1;
			break;
		}

		NODENAME(node, &label);
		total += label.length;
		node = get_upper_node(node);

		if (dns_name_isabsolute(&label)) {
			break;
		}
	}

	return (total);
}
2012-06-20 23:46:40 +00:00
/*
 * Write a single node to 'file' at the current position rounded up to a
 * multiple of 8 (see dns_rbt_serialize_align()), and fold the bytes
 * written into '*crc'.
 *
 * The node header is written from a local copy of '*node' in which
 * every non-NULL left/right/down/parent/data pointer is replaced by the
 * value passed in the argument of the same name and flagged as
 * relative; 'node' itself is not modified.  The copy is also marked
 * is_mmapped and gets its fullnamelen from dns__rbtnode_namelen().
 * The NODE_SIZE(node) - sizeof(dns_rbtnode_t) bytes that follow the
 * header in memory are written unchanged.
 *
 * Returns ISC_R_SUCCESS, or the result of the first failing I/O call.
 */
static isc_result_t
serialize_node(FILE *file, dns_rbtnode_t *node, uintptr_t left, uintptr_t right,
	       uintptr_t down, uintptr_t parent, uintptr_t data,
	       uint64_t *crc) {
	isc_result_t result;
	dns_rbtnode_t temp_node;
	off_t file_position;
	unsigned char *node_data = NULL;
	size_t datasize;
#ifdef DEBUG
	dns_name_t nodename;
#endif /* ifdef DEBUG */

	INSIST(node != NULL);

	CHECK(isc_stdio_tell(file, &file_position));
	file_position = dns_rbt_serialize_align(file_position);
	CHECK(isc_stdio_seek(file, file_position, SEEK_SET));

	temp_node = *node;
	temp_node.down_is_relative = 0;
	temp_node.left_is_relative = 0;
	temp_node.right_is_relative = 0;
	temp_node.parent_is_relative = 0;
	temp_node.data_is_relative = 0;
	temp_node.is_mmapped = 1;

	/*
	 * Replace each pointer that is set in the in-memory node with the
	 * value supplied by the caller and mark it as relative; NULL
	 * pointers stay NULL and unflagged.
	 */
	if (temp_node.parent != NULL) {
		temp_node.parent = (dns_rbtnode_t *)(parent);
		temp_node.parent_is_relative = 1;
	}
	if (temp_node.left != NULL) {
		temp_node.left = (dns_rbtnode_t *)(left);
		temp_node.left_is_relative = 1;
	}
	if (temp_node.right != NULL) {
		temp_node.right = (dns_rbtnode_t *)(right);
		temp_node.right_is_relative = 1;
	}
	if (temp_node.down != NULL) {
		temp_node.down = (dns_rbtnode_t *)(down);
		temp_node.down_is_relative = 1;
	}
	if (temp_node.data != NULL) {
		temp_node.data = (dns_rbtnode_t *)(data);
		temp_node.data_is_relative = 1;
	}

	temp_node.fullnamelen = dns__rbtnode_namelen(node);

	/* The bytes stored directly after the node structure. */
	node_data = (unsigned char *)node + sizeof(dns_rbtnode_t);
	datasize = NODE_SIZE(node) - sizeof(dns_rbtnode_t);

	CHECK(isc_stdio_write(&temp_node, 1, sizeof(dns_rbtnode_t), file,
			      NULL));
	CHECK(isc_stdio_write(node_data, 1, datasize, file, NULL));

#ifdef DEBUG
	dns_name_init(&nodename, NULL);
	NODENAME(node, &nodename);
	fprintf(stderr, "serialize ");
	dns_name_print(&nodename, stderr);
	fprintf(stderr, "\n");
	hexdump("node header", (unsigned char *)&temp_node,
		sizeof(dns_rbtnode_t));
	hexdump("node data", node_data, datasize);
#endif /* ifdef DEBUG */

	/* The checksum covers exactly the bytes that were written. */
	isc_crc64_update(crc, (const uint8_t *)&temp_node,
			 sizeof(dns_rbtnode_t));
	isc_crc64_update(crc, (const uint8_t *)node_data, datasize);

cleanup:
	return (result);
}
/*
 * Recursively write the subtree rooted at 'node' to 'file'.
 *
 * Space for the node is reserved first at an aligned position; the
 * left, right and down subtrees and then the node's data (through
 * 'datawriter') are written after it, and finally the node itself is
 * written into the reserved space with the positions of its children,
 * 'parent' and its data.
 *
 * If 'where' is not NULL it receives the position reserved for 'node',
 * or 0 when 'node' is NULL.  All written node bytes are folded into
 * '*crc' by serialize_node(); 'crc' is also passed to 'datawriter'.
 *
 * Returns ISC_R_SUCCESS, or the first failure reported by an I/O call,
 * a recursive call or 'datawriter'.
 */
static isc_result_t
serialize_nodes(FILE *file, dns_rbtnode_t *node, uintptr_t parent,
		dns_rbtdatawriter_t datawriter, void *writer_arg,
		uintptr_t *where, uint64_t *crc) {
	uintptr_t left = 0, right = 0, down = 0, data = 0;
	off_t location = 0, offset_adjust;
	isc_result_t result;

	if (node == NULL) {
		if (where != NULL) {
			*where = 0;
		}
		return (ISC_R_SUCCESS);
	}

	/* Reserve space for current node. */
	CHECK(isc_stdio_tell(file, &location));
	location = dns_rbt_serialize_align(location);
	CHECK(isc_stdio_seek(file, location, SEEK_SET));

	offset_adjust = dns_rbt_serialize_align(location + NODE_SIZE(node));
	CHECK(isc_stdio_seek(file, offset_adjust, SEEK_SET));

	/*
	 * Serialize the rest of the tree.
	 *
	 * WARNING: A change in the order (from left, right, down)
	 * will break the way the crc hash is computed.
	 */
	CHECK(serialize_nodes(file, getleft(node, NULL), location, datawriter,
			      writer_arg, &left, crc));
	CHECK(serialize_nodes(file, getright(node, NULL), location, datawriter,
			      writer_arg, &right, crc));
	CHECK(serialize_nodes(file, getdown(node, NULL), location, datawriter,
			      writer_arg, &down, crc));

	if (node->data != NULL) {
		off_t ret;

		CHECK(isc_stdio_tell(file, &ret));
		ret = dns_rbt_serialize_align(ret);
		CHECK(isc_stdio_seek(file, ret, SEEK_SET));
		data = ret;

		/*
		 * Propagate a failure to write the node's data instead
		 * of silently producing a file with missing data.
		 */
		CHECK(datawriter(file, node->data, writer_arg, crc));
	}

	/* Seek back to reserved space. */
	CHECK(isc_stdio_seek(file, location, SEEK_SET));

	/* Serialize the current node. */
	CHECK(serialize_node(file, node, left, right, down, parent, data, crc));

	/* Ensure we are always at the end of the file. */
	CHECK(isc_stdio_seek(file, 0, SEEK_END));

	if (where != NULL) {
		*where = (uintptr_t)location;
	}

cleanup:
	return (result);
}
/*
 * Round 'target' up to the next multiple of 8.  A value that is already
 * a multiple of 8 is returned unchanged.
 */
off_t
dns_rbt_serialize_align(off_t target) {
	off_t offset = target % 8;

	if (offset == 0) {
		return (target);
	}

	return (target + 8 - offset);
}
2012-06-20 23:46:40 +00:00
/*
 * Serialize 'rbt' to 'file', starting at the current file position.
 *
 * A zeroed placeholder header is written first, then all nodes (node
 * data is written through 'datawriter' with 'writer_arg'), and finally
 * the real header, carrying the CRC of the nodes, is written over the
 * placeholder.
 *
 * On success '*offset' is set to the aligned position of the header,
 * or to 0 if no nodes were written; in the latter case the file is
 * positioned back at the header position and the placeholder is left
 * in place.
 *
 * Requires:
 *	'file' is not NULL and refers to a plain file (otherwise the
 *	result of isc_file_isplainfilefd() is returned).
 *	'rbt' is a valid rbt.
 *	'offset' is not NULL.
 *
 * Returns ISC_R_SUCCESS, or the first failing result.
 */
isc_result_t
dns_rbt_serialize_tree(FILE *file, dns_rbt_t *rbt,
		       dns_rbtdatawriter_t datawriter, void *writer_arg,
		       off_t *offset) {
	isc_result_t result;
	off_t header_position, node_position, end_position;
	uint64_t crc;

	REQUIRE(file != NULL);
	REQUIRE(VALID_RBT(rbt));
	REQUIRE(offset != NULL);

	CHECK(isc_file_isplainfilefd(fileno(file)));

	isc_crc64_init(&crc);

	CHECK(isc_stdio_tell(file, &header_position));

	/* Write dummy header */
	CHECK(dns_rbt_zero_header(file));

	/* Serialize nodes */
	CHECK(isc_stdio_tell(file, &node_position));
	CHECK(serialize_nodes(file, rbt->root, 0, datawriter, writer_arg, NULL,
			      &crc));

	CHECK(isc_stdio_tell(file, &end_position));
	if (node_position == end_position) {
		/* Nothing was written after the placeholder header. */
		CHECK(isc_stdio_seek(file, header_position, SEEK_SET));
		*offset = 0;
		return (ISC_R_SUCCESS);
	}

	isc_crc64_final(&crc);
#ifdef DEBUG
	hexdump("serializing CRC", (unsigned char *)&crc, sizeof(crc));
#endif /* ifdef DEBUG */

	/* Serialize header */
	CHECK(isc_stdio_seek(file, header_position, SEEK_SET));
	CHECK(write_header(file, rbt, HEADER_LENGTH, crc));

	/* Ensure we are always at the end of the file. */
	CHECK(isc_stdio_seek(file, 0, SEEK_END));
	*offset = dns_rbt_serialize_align(header_position);

cleanup:
	return (result);
}
/*
 * Validation helper: unless 'a' is true, set 'result' to
 * ISC_R_INVALIDFILE and jump to the enclosing function's 'cleanup'
 * label.
 *
 * The expansion deliberately does not end in a semicolon, so that
 * "CONFIRM(x);" is a single statement and can be used in an if/else.
 */
#define CONFIRM(a)                                  \
	do {                                        \
		if (!(a)) {                         \
			result = ISC_R_INVALIDFILE; \
			goto cleanup;               \
		}                                   \
	} while (0)
/*
 * Validate and fix up the node 'n' of a deserialized tree, and
 * recursively every node reachable from it.
 *
 * 'base' and 'filesize' describe the memory region holding the file
 * image; 'name' is the name the node's own name is concatenated with
 * when it is not absolute.  For each node, the pointers flagged as
 * relative are bounds-checked and converted with getleft()/getright()/
 * getdown()/getparent()/getdata() using rbt->mmap_location, the node is
 * added to the hash table of 'rbt', its subtrees are processed in the
 * order left, right, down, 'datafixer' (if not NULL) is called for
 * nodes with data, rbt->nodecount is incremented, and the node's
 * pre-fixup header plus trailing data are folded into '*crc'.
 *
 * Returns ISC_R_SUCCESS (also when 'n' is NULL), ISC_R_INVALIDFILE if
 * a consistency check fails, or the failure result of
 * dns_name_concatenate(), 'datafixer' or a recursive call.
 */
static isc_result_t
treefix(dns_rbt_t *rbt, void *base, size_t filesize, dns_rbtnode_t *n,
	const dns_name_t *name, dns_rbtdatafixer_t datafixer, void *fixer_arg,
	uint64_t *crc) {
	isc_result_t result = ISC_R_SUCCESS;
	dns_fixedname_t fixed;
	dns_name_t nodename, *fullname;
	unsigned char *node_data;
	dns_rbtnode_t header;
	size_t datasize, nodemax;

	if (n == NULL) {
		return (ISC_R_SUCCESS);
	}

	/*
	 * A region too small to hold even one node header cannot contain
	 * 'n'; checking first also keeps the subtraction from wrapping.
	 */
	CONFIRM(filesize >= sizeof(dns_rbtnode_t));
	nodemax = filesize - sizeof(dns_rbtnode_t);

	CONFIRM((void *)n >= base);
	/*
	 * 'n' is known to be at or after 'base' here, so the difference
	 * is non-negative and can be compared as size_t without
	 * truncating 'nodemax'.
	 */
	CONFIRM((size_t)((char *)n - (char *)base) <= nodemax);
	CONFIRM(DNS_RBTNODE_VALID(n));

	dns_name_init(&nodename, NULL);
	NODENAME(n, &nodename);

	fullname = &nodename;
	CONFIRM(dns_name_isvalid(fullname));

	if (!dns_name_isabsolute(&nodename)) {
		fullname = dns_fixedname_initname(&fixed);
		CHECK(dns_name_concatenate(&nodename, name, fullname, NULL));
	}

	/* memorize header contents prior to fixup */
	memmove(&header, n, sizeof(header));

	if (n->left_is_relative) {
		CONFIRM(n->left <= (dns_rbtnode_t *)nodemax);
		n->left = getleft(n, rbt->mmap_location);
		n->left_is_relative = 0;
		CONFIRM(DNS_RBTNODE_VALID(n->left));
	} else {
		CONFIRM(n->left == NULL);
	}

	if (n->right_is_relative) {
		CONFIRM(n->right <= (dns_rbtnode_t *)nodemax);
		n->right = getright(n, rbt->mmap_location);
		n->right_is_relative = 0;
		CONFIRM(DNS_RBTNODE_VALID(n->right));
	} else {
		CONFIRM(n->right == NULL);
	}

	if (n->down_is_relative) {
		CONFIRM(n->down <= (dns_rbtnode_t *)nodemax);
		n->down = getdown(n, rbt->mmap_location);
		n->down_is_relative = 0;
		CONFIRM(n->down > (dns_rbtnode_t *)n);
		CONFIRM(DNS_RBTNODE_VALID(n->down));
	} else {
		CONFIRM(n->down == NULL);
	}

	if (n->parent_is_relative) {
		CONFIRM(n->parent <= (dns_rbtnode_t *)nodemax);
		n->parent = getparent(n, rbt->mmap_location);
		n->parent_is_relative = 0;
		CONFIRM(n->parent < (dns_rbtnode_t *)n);
		CONFIRM(DNS_RBTNODE_VALID(n->parent));
	} else {
		CONFIRM(n->parent == NULL);
	}

	if (n->data_is_relative) {
		CONFIRM(n->data <= (void *)filesize);
		n->data = getdata(n, rbt->mmap_location);
		n->data_is_relative = 0;
		CONFIRM(n->data > (void *)n);
	} else {
		CONFIRM(n->data == NULL);
	}

	hash_node(rbt, n, fullname);

	/* a change in the order (from left, right, down) will break hashing*/
	if (n->left != NULL) {
		CHECK(treefix(rbt, base, filesize, n->left, name, datafixer,
			      fixer_arg, crc));
	}
	if (n->right != NULL) {
		CHECK(treefix(rbt, base, filesize, n->right, name, datafixer,
			      fixer_arg, crc));
	}
	if (n->down != NULL) {
		CHECK(treefix(rbt, base, filesize, n->down, fullname, datafixer,
			      fixer_arg, crc));
	}

	if (datafixer != NULL && n->data != NULL) {
		CHECK(datafixer(n, base, filesize, fixer_arg, crc));
	}

	rbt->nodecount++;
	node_data = (unsigned char *)n + sizeof(dns_rbtnode_t);
	datasize = NODE_SIZE(n) - sizeof(dns_rbtnode_t);

#ifdef DEBUG
	fprintf(stderr, "deserialize ");
	dns_name_print(&nodename, stderr);
	fprintf(stderr, "\n");
	hexdump("node header", (unsigned char *)&header, sizeof(dns_rbtnode_t));
	hexdump("node data", node_data, datasize);
#endif /* ifdef DEBUG */
	/* The checksum uses the header as it was before the fixup. */
	isc_crc64_update(crc, (const uint8_t *)&header, sizeof(dns_rbtnode_t));
	isc_crc64_update(crc, (const uint8_t *)node_data, datasize);

cleanup:
	return (result);
}
2012-06-20 23:46:40 +00:00
isc_result_t
dns_rbt_deserialize_tree(void *base_address, size_t filesize,
off_t header_offset, isc_mem_t *mctx,
dns_rbtdeleter_t deleter, void *deleter_arg,
dns_rbtdatafixer_t datafixer, void *fixer_arg,
2020-02-13 14:44:37 -08:00
dns_rbtnode_t **originp, dns_rbt_t **rbtp) {
isc_result_t result = ISC_R_SUCCESS;
file_header_t *header;
2020-02-13 14:44:37 -08:00
dns_rbt_t *rbt = NULL;
uint64_t crc;
unsigned int host_big_endian;
REQUIRE(originp == NULL || *originp == NULL);
REQUIRE(rbtp != NULL && *rbtp == NULL);
2012-06-20 23:46:40 +00:00
isc_crc64_init(&crc);
CHECK(dns_rbt_create(mctx, deleter, deleter_arg, &rbt));
rbt->mmap_location = base_address;
header = (file_header_t *)((char *)base_address + header_offset);
if (!match_header_version(header)) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
2012-06-20 23:46:40 +00:00
#ifdef DNS_RDATASET_FIXED
if (header->rdataset_fixed != 1) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
2012-06-20 23:46:40 +00:00
#else /* ifdef DNS_RDATASET_FIXED */
if (header->rdataset_fixed != 0) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
#endif /* ifdef DNS_RDATASET_FIXED */
if (header->ptrsize != (uint32_t)sizeof(void *)) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
host_big_endian = (1 == htonl(1));
if (header->bigendian != host_big_endian) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
2012-06-20 23:46:40 +00:00
/* Copy other data items from the header into our rbt. */
rbt->root = (dns_rbtnode_t *)((char *)base_address + header_offset +
header->first_node_offset);
if ((header->nodecount * sizeof(dns_rbtnode_t)) > filesize) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
maybe_rehash(rbt, header->nodecount);
CHECK(treefix(rbt, base_address, filesize, rbt->root, dns_rootname,
datafixer, fixer_arg, &crc));
isc_crc64_final(&crc);
#ifdef DEBUG
hexdump("deserializing CRC", (unsigned char *)&crc, sizeof(crc));
#endif /* ifdef DEBUG */
/* Check file hash */
if (header->crc != crc) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
2015-12-09 19:07:20 +05:30
if (header->nodecount != rbt->nodecount) {
result = ISC_R_INVALIDFILE;
goto cleanup;
}
fixup_uppernodes(rbt);
*rbtp = rbt;
if (originp != NULL) {
*originp = rbt->root;
}
cleanup:
if (result != ISC_R_SUCCESS && rbt != NULL) {
rbt->root = NULL;
rbt->nodecount = 0;
dns_rbt_destroy(&rbt);
}
return (result);
}
/*
* Initialize a red/black tree of trees.
*/
isc_result_t
dns_rbt_create(isc_mem_t *mctx, dns_rbtdeleter_t deleter, void *deleter_arg,
2020-02-13 14:44:37 -08:00
dns_rbt_t **rbtp) {
2008-01-22 23:28:04 +00:00
isc_result_t result;
2020-02-13 14:44:37 -08:00
dns_rbt_t *rbt;
2008-01-22 23:28:04 +00:00
REQUIRE(mctx != NULL);
REQUIRE(rbtp != NULL && *rbtp == NULL);
REQUIRE(deleter == NULL ? deleter_arg == NULL : 1);
rbt = isc_mem_get(mctx, sizeof(*rbt));
rbt->mctx = NULL;
isc_mem_attach(mctx, &rbt->mctx);
2008-01-22 23:28:04 +00:00
rbt->data_deleter = deleter;
rbt->deleter_arg = deleter_arg;
rbt->root = NULL;
rbt->nodecount = 0;
rbt->hashtable = NULL;
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
rbt->hashbits = 0;
rbt->maxhashbits = RBT_HASH_MAX_BITS;
rbt->mmap_location = NULL;
2008-01-22 23:28:04 +00:00
result = inithash(rbt);
if (result != ISC_R_SUCCESS) {
isc_mem_putanddetach(&rbt->mctx, rbt, sizeof(*rbt));
2008-01-22 23:28:04 +00:00
return (result);
}
2008-01-22 23:28:04 +00:00
rbt->magic = RBT_MAGIC;
2008-01-22 23:28:04 +00:00
*rbtp = rbt;
2008-01-22 23:28:04 +00:00
return (ISC_R_SUCCESS);
}
/*
* Deallocate a red/black tree of trees.
*/
void
2020-02-13 14:44:37 -08:00
dns_rbt_destroy(dns_rbt_t **rbtp) {
2008-01-22 23:28:04 +00:00
RUNTIME_CHECK(dns_rbt_destroy2(rbtp, 0) == ISC_R_SUCCESS);
}
isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbt_destroy2(dns_rbt_t **rbtp, unsigned int quantum) {
2008-01-22 23:28:04 +00:00
dns_rbt_t *rbt;
2008-01-22 23:28:04 +00:00
REQUIRE(rbtp != NULL && VALID_RBT(*rbtp));
2008-01-22 23:28:04 +00:00
rbt = *rbtp;
deletetreeflat(rbt, quantum, false, &rbt->root);
if (rbt->root != NULL) {
2008-01-22 23:28:04 +00:00
return (ISC_R_QUOTA);
}
*rbtp = NULL;
2008-01-22 23:28:04 +00:00
INSIST(rbt->nodecount == 0);
2012-06-20 23:46:40 +00:00
rbt->mmap_location = NULL;
if (rbt->hashtable != NULL) {
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
size_t size = HASHSIZE(rbt->hashbits) * sizeof(dns_rbtnode_t *);
isc_mem_put(rbt->mctx, rbt->hashtable, size);
}
2008-01-22 23:28:04 +00:00
rbt->magic = 0;
isc_mem_putanddetach(&rbt->mctx, rbt, sizeof(*rbt));
2008-01-22 23:28:04 +00:00
return (ISC_R_SUCCESS);
}
unsigned int
2020-02-13 14:44:37 -08:00
dns_rbt_nodecount(dns_rbt_t *rbt) {
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
2008-01-22 23:28:04 +00:00
return (rbt->nodecount);
}
2015-12-09 19:07:20 +05:30
size_t
2020-02-13 14:44:37 -08:00
dns_rbt_hashsize(dns_rbt_t *rbt) {
REQUIRE(VALID_RBT(rbt));
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
return (1 << rbt->hashbits);
}
/*
 * Adjust the hash table of 'rbt' for 'size'.
 *
 * 'size' is divided by RBT_HASH_BUCKETSIZE; the quotient is used to
 * compute the new rbt->maxhashbits with rehash_bits() and is then
 * handed to maybe_rehash().
 *
 * Requires:
 *	'rbt' is a valid rbt.
 *
 * Always returns ISC_R_SUCCESS.
 */
isc_result_t
dns_rbt_adjusthashsize(dns_rbt_t *rbt, size_t size) {
	size_t buckets;

	REQUIRE(VALID_RBT(rbt));

	buckets = size / RBT_HASH_BUCKETSIZE;

	rbt->maxhashbits = rehash_bits(rbt, buckets);
	maybe_rehash(rbt, buckets);

	return (ISC_R_SUCCESS);
}
static inline isc_result_t
2020-02-13 14:44:37 -08:00
chain_name(dns_rbtnodechain_t *chain, dns_name_t *name,
bool include_chain_end) {
dns_name_t nodename;
2008-01-22 23:28:04 +00:00
isc_result_t result = ISC_R_SUCCESS;
2020-02-13 14:44:37 -08:00
int i;
2008-01-22 23:28:04 +00:00
dns_name_init(&nodename, NULL);
if (include_chain_end && chain->end != NULL) {
NODENAME(chain->end, &nodename);
dns_name_copynf(&nodename, name);
} else {
2008-01-22 23:28:04 +00:00
dns_name_reset(name);
}
2008-01-22 23:28:04 +00:00
for (i = (int)chain->level_count - 1; i >= 0; i--) {
NODENAME(chain->levels[i], &nodename);
result = dns_name_concatenate(name, &nodename, name, NULL);
if (result != ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
return (result);
}
2008-01-22 23:28:04 +00:00
}
return (result);
}
static inline isc_result_t
2020-02-13 14:44:37 -08:00
move_chain_to_last(dns_rbtnodechain_t *chain, dns_rbtnode_t *node) {
2008-01-22 23:28:04 +00:00
do {
/*
* Go as far right and then down as much as possible,
* as long as the rightmost node has a down pointer.
*/
while (RIGHT(node) != NULL) {
2008-01-22 23:28:04 +00:00
node = RIGHT(node);
}
if (DOWN(node) == NULL) {
2008-01-22 23:28:04 +00:00
break;
}
2008-01-22 23:28:04 +00:00
ADD_LEVEL(chain, node);
node = DOWN(node);
} while (1);
2008-01-22 23:28:04 +00:00
chain->end = node;
2008-01-22 23:28:04 +00:00
return (ISC_R_SUCCESS);
}
/*
* Add 'name' to tree, initializing its data pointer with 'data'.
*/
isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbt_addnode(dns_rbt_t *rbt, const dns_name_t *name, dns_rbtnode_t **nodep) {
2008-01-22 23:28:04 +00:00
/*
* Does this thing have too many variables or what?
*/
dns_rbtnode_t **root, *parent, *child, *current, *new_current;
2020-02-13 14:44:37 -08:00
dns_name_t *add_name, *new_name, current_name, *prefix, *suffix;
2008-01-22 23:28:04 +00:00
dns_fixedname_t fixedcopy, fixedprefix, fixedsuffix, fnewname;
2020-02-13 14:44:37 -08:00
dns_offsets_t current_offsets;
dns_namereln_t compared;
isc_result_t result = ISC_R_SUCCESS;
unsigned int level_count;
unsigned int common_labels;
unsigned int nlabels, hlabels;
int order;
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
REQUIRE(dns_name_isabsolute(name));
REQUIRE(nodep != NULL && *nodep == NULL);
/*
* Dear future BIND developer,
*
* After you have tried attempting to optimize this routine by
* using the hashtable and have realized your folly, please
* append another cross ("X") below as a warning to the next
* future BIND developer:
*
* Number of victim developers: X
*
* I wish the past developer had included such a notice.
*
* Long form: Unlike dns_rbt_findnode(), this function does not
* lend itself to be optimized using the hashtable:
*
* 1. In the subtree where the insertion occurs, this function
* needs to have the insertion point and the order where the
* lookup terminated (i.e., at the insertion point where left or
* right child is NULL). This cannot be determined from the
* hashtable, so at least in that subtree, a BST O(log N) lookup
* is necessary.
*
* 2. Our RBT nodes contain not only single labels but label
* sequences to optimize space usage. So at every level, we have
* to look for a match in the hashtable for all superdomains in
* the rest of the name we're searching. This is an O(N)
* operation at least, here N being the label size of name, each
* of which is a hashtable lookup involving dns_name_equal()
* comparisons.
*/
2008-01-22 23:28:04 +00:00
/*
* Create a copy of the name so the original name structure is
* not modified.
*/
add_name = dns_fixedname_initname(&fixedcopy);
INSIST(add_name != NULL);
2008-01-22 23:28:04 +00:00
dns_name_clone(name, add_name);
if (ISC_UNLIKELY(rbt->root == NULL)) {
2008-01-22 23:28:04 +00:00
result = create_node(rbt->mctx, add_name, &new_current);
if (result == ISC_R_SUCCESS) {
rbt->nodecount++;
new_current->is_root = 1;
2015-12-09 19:07:20 +05:30
UPPERNODE(new_current) = NULL;
2008-01-22 23:28:04 +00:00
rbt->root = new_current;
*nodep = new_current;
hash_node(rbt, new_current, name);
}
return (result);
}
level_count = 0;
2008-01-22 23:28:04 +00:00
prefix = dns_fixedname_initname(&fixedprefix);
suffix = dns_fixedname_initname(&fixedsuffix);
2008-01-22 23:28:04 +00:00
INSIST(prefix != NULL);
INSIST(suffix != NULL);
2008-01-22 23:28:04 +00:00
root = &rbt->root;
INSIST(IS_ROOT(*root));
parent = NULL;
current = NULL;
child = *root;
dns_name_init(&current_name, current_offsets);
new_name = dns_fixedname_initname(&fnewname);
2008-01-22 23:28:04 +00:00
nlabels = dns_name_countlabels(name);
hlabels = 0;
do {
current = child;
NODENAME(current, &current_name);
compared = dns_name_fullcompare(add_name, &current_name, &order,
&common_labels);
2008-01-22 23:28:04 +00:00
if (compared == dns_namereln_equal) {
*nodep = current;
result = ISC_R_EXISTS;
break;
}
if (compared == dns_namereln_none) {
if (order < 0) {
parent = current;
child = LEFT(current);
} else if (order > 0) {
parent = current;
child = RIGHT(current);
}
} else {
/*
* This name has some suffix in common with the
* name at the current node. If the name at
* the current node is shorter, that means the
* new name should be in a subtree. If the
* name at the current node is longer, that means
* the down pointer to this tree should point
* to a new tree that has the common suffix, and
* the non-common parts of these two names should
* start a new tree.
*/
hlabels += common_labels;
if (compared == dns_namereln_subdomain) {
/*
* All of the existing labels are in common,
* so the new name is in a subtree.
* Whack off the common labels for the
* not-in-common part to be searched for
* in the next level.
*/
dns_name_split(add_name, common_labels,
add_name, NULL);
/*
* Follow the down pointer (possibly NULL).
*/
root = &DOWN(current);
INSIST(*root == NULL ||
(IS_ROOT(*root) &&
PARENT(*root) == current));
parent = NULL;
child = DOWN(current);
INSIST(level_count < DNS_RBT_LEVELBLOCK);
level_count++;
2008-01-22 23:28:04 +00:00
} else {
/*
* The number of labels in common is fewer
* than the number of labels at the current
* node, so the current node must be adjusted
* to have just the common suffix, and a down
* pointer made to a new tree.
*/
INSIST(compared ==
dns_namereln_commonancestor ||
compared == dns_namereln_contains);
2008-01-22 23:28:04 +00:00
/*
* Ensure the number of levels in the tree
* does not exceed the number of logical
* levels allowed by DNSSEC.
*
* XXXDCL need a better error result?
*/
if (level_count >= DNS_RBT_LEVELBLOCK) {
2008-01-22 23:28:04 +00:00
result = ISC_R_NOSPACE;
break;
}
/*
* Split the name into two parts, a prefix
* which is the not-in-common parts of the
* two names and a suffix that is the common
* parts of them.
*/
dns_name_split(&current_name, common_labels,
prefix, suffix);
result = create_node(rbt->mctx, suffix,
&new_current);
if (result != ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
break;
}
2008-01-22 23:28:04 +00:00
/*
* Reproduce the tree attributes of the
* current node.
*/
new_current->is_root = current->is_root;
if (current->nsec == DNS_RBT_NSEC_HAS_NSEC) {
new_current->nsec = DNS_RBT_NSEC_NORMAL;
} else {
new_current->nsec = current->nsec;
}
PARENT(new_current) = PARENT(current);
LEFT(new_current) = LEFT(current);
RIGHT(new_current) = RIGHT(current);
COLOR(new_current) = COLOR(current);
2008-01-22 23:28:04 +00:00
/*
* Fix pointers that were to the current node.
*/
if (parent != NULL) {
if (LEFT(parent) == current) {
2008-01-22 23:28:04 +00:00
LEFT(parent) = new_current;
} else {
2008-01-22 23:28:04 +00:00
RIGHT(parent) = new_current;
}
2008-01-22 23:28:04 +00:00
}
if (LEFT(new_current) != NULL) {
PARENT(LEFT(new_current)) = new_current;
}
if (RIGHT(new_current) != NULL) {
2008-01-22 23:28:04 +00:00
PARENT(RIGHT(new_current)) =
new_current;
}
if (*root == current) {
2008-01-22 23:28:04 +00:00
*root = new_current;
}
2008-01-22 23:28:04 +00:00
NAMELEN(current) = prefix->length;
OFFSETLEN(current) = prefix->labels;
/*
* Set up the new root of the next level.
* By definition it will not be the top
* level tree, so clear DNS_NAMEATTR_ABSOLUTE.
*/
current->is_root = 1;
PARENT(current) = new_current;
DOWN(new_current) = current;
root = &DOWN(new_current);
2015-12-09 19:07:20 +05:30
UPPERNODE(new_current) = UPPERNODE(current);
UPPERNODE(current) = new_current;
INSIST(level_count < DNS_RBT_LEVELBLOCK);
level_count++;
2008-01-22 23:28:04 +00:00
LEFT(current) = NULL;
RIGHT(current) = NULL;
MAKE_BLACK(current);
ATTRS(current) &= ~DNS_NAMEATTR_ABSOLUTE;
rbt->nodecount++;
dns_name_getlabelsequence(name,
nlabels - hlabels,
hlabels, new_name);
hash_node(rbt, new_current, new_name);
if (common_labels ==
dns_name_countlabels(add_name)) {
/*
* The name has been added by pushing
* the not-in-common parts down to
* a new level.
*/
*nodep = new_current;
return (ISC_R_SUCCESS);
} else {
/*
* The current node has no data,
* because it is just a placeholder.
* Its data pointer is already NULL
* from create_node()), so there's
* nothing more to do to it.
*/
/*
* The not-in-common parts of the new
* name will be inserted into the new
* level following this loop (unless
* result != ISC_R_SUCCESS, which
* is tested after the loop ends).
*/
dns_name_split(add_name, common_labels,
add_name, NULL);
break;
}
}
}
} while (ISC_LIKELY(child != NULL));
2008-01-22 23:28:04 +00:00
if (ISC_LIKELY(result == ISC_R_SUCCESS)) {
2008-01-22 23:28:04 +00:00
result = create_node(rbt->mctx, add_name, &new_current);
}
2008-01-22 23:28:04 +00:00
if (ISC_LIKELY(result == ISC_R_SUCCESS)) {
if (*root == NULL) {
2015-12-09 23:45:23 +00:00
UPPERNODE(new_current) = current;
} else {
2015-12-09 23:45:23 +00:00
UPPERNODE(new_current) = PARENT(*root);
}
addonlevel(new_current, current, order, root);
2008-01-22 23:28:04 +00:00
rbt->nodecount++;
*nodep = new_current;
hash_node(rbt, new_current, name);
}
return (result);
}
/*
* Add a name to the tree of trees, associating it with some data.
*/
isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbt_addname(dns_rbt_t *rbt, const dns_name_t *name, void *data) {
isc_result_t result;
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *node;
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
REQUIRE(dns_name_isabsolute(name));
2008-01-22 23:28:04 +00:00
node = NULL;
2008-01-22 23:28:04 +00:00
result = dns_rbt_addnode(rbt, name, &node);
2008-01-22 23:28:04 +00:00
/*
* dns_rbt_addnode will report the node exists even when
* it does not have data associated with it, but the
* dns_rbt_*name functions all behave depending on whether
* there is data associated with a node.
*/
if (result == ISC_R_SUCCESS ||
(result == ISC_R_EXISTS && DATA(node) == NULL)) {
DATA(node) = data;
result = ISC_R_SUCCESS;
}
2008-01-22 23:28:04 +00:00
return (result);
}
/*
* Find the node for "name" in the tree of trees.
*/
isc_result_t
dns_rbt_findnode(dns_rbt_t *rbt, const dns_name_t *name, dns_name_t *foundname,
2008-01-22 23:28:04 +00:00
dns_rbtnode_t **node, dns_rbtnodechain_t *chain,
unsigned int options, dns_rbtfindcallback_t callback,
2020-02-13 14:44:37 -08:00
void *callback_arg) {
dns_rbtnode_t *current, *last_compared;
2008-01-22 23:28:04 +00:00
dns_rbtnodechain_t localchain;
2020-02-13 14:44:37 -08:00
dns_name_t *search_name, current_name, *callback_name;
dns_fixedname_t fixedcallbackname, fixedsearchname;
dns_namereln_t compared;
isc_result_t result, saved_result;
unsigned int common_labels;
unsigned int hlabels = 0;
int order;
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
REQUIRE(dns_name_isabsolute(name));
REQUIRE(node != NULL && *node == NULL);
REQUIRE((options & (DNS_RBTFIND_NOEXACT | DNS_RBTFIND_NOPREDECESSOR)) !=
(DNS_RBTFIND_NOEXACT | DNS_RBTFIND_NOPREDECESSOR));
2008-01-22 23:28:04 +00:00
/*
* If there is a chain it needs to appear to be in a sane state,
* otherwise a chain is still needed to generate foundname and
* callback_name.
*/
if (chain == NULL) {
options |= DNS_RBTFIND_NOPREDECESSOR;
chain = &localchain;
dns_rbtnodechain_init(chain);
} else {
2008-01-22 23:28:04 +00:00
dns_rbtnodechain_reset(chain);
}
2008-01-22 23:28:04 +00:00
if (ISC_UNLIKELY(rbt->root == NULL)) {
2008-01-22 23:28:04 +00:00
return (ISC_R_NOTFOUND);
}
/*
* Appease GCC about variables it incorrectly thinks are
* possibly used uninitialized.
*/
compared = dns_namereln_none;
last_compared = NULL;
order = 0;
2008-01-22 23:28:04 +00:00
callback_name = dns_fixedname_initname(&fixedcallbackname);
2008-01-22 23:28:04 +00:00
/*
* search_name is the name segment being sought in each tree level.
* By using a fixedname, the search_name will definitely have offsets
* for use by any splitting.
* By using dns_name_clone, no name data should be copied thanks to
* the lack of bitstring labels.
*/
search_name = dns_fixedname_initname(&fixedsearchname);
INSIST(search_name != NULL);
2008-01-22 23:28:04 +00:00
dns_name_clone(name, search_name);
dns_name_init(&current_name, NULL);
saved_result = ISC_R_SUCCESS;
current = rbt->root;
while (ISC_LIKELY(current != NULL)) {
2008-01-22 23:28:04 +00:00
NODENAME(current, &current_name);
compared = dns_name_fullcompare(search_name, &current_name,
&order, &common_labels);
/*
* last_compared is used as a shortcut to start (or
* continue rather) finding the stop-node of the search
* when hashing was used (see much below in this
* function).
*/
2008-01-22 23:28:04 +00:00
last_compared = current;
if (compared == dns_namereln_equal) {
2008-01-22 23:28:04 +00:00
break;
}
2008-01-22 23:28:04 +00:00
if (compared == dns_namereln_none) {
2014-05-30 09:41:33 +10:00
/*
* Here, current is pointing at a subtree root
* node. We try to find a matching node using
* the hashtable. We can get one of 3 results
* here: (a) we locate the matching node, (b) we
* find a node to which the current node has a
* subdomain relation, (c) we fail to find (a)
* or (b).
*/
2020-02-13 14:44:37 -08:00
dns_name_t hash_name;
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *hnode;
dns_rbtnode_t *up_current;
2020-02-13 14:44:37 -08:00
unsigned int nlabels;
unsigned int tlabels = 1;
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
uint32_t hash;
2008-01-22 23:28:04 +00:00
/*
* The case of current not being a subtree root,
* that means a left or right pointer was
* followed, only happens when the algorithm
* fell through to the traditional binary search
* because of a bitstring label. Since we
* dropped the bitstring support, this should
* not happen.
2008-01-22 23:28:04 +00:00
*/
INSIST(IS_ROOT(current));
2008-01-22 23:28:04 +00:00
nlabels = dns_name_countlabels(search_name);
/*
* current is the root of the current level, so
2015-12-09 19:07:20 +05:30
* its parent is the same as its "up" pointer.
2008-01-22 23:28:04 +00:00
*/
up_current = PARENT(current);
2008-01-22 23:28:04 +00:00
dns_name_init(&hash_name, NULL);
hashagain:
/*
* Compute the hash over the full absolute
* name. Look for the smallest suffix match at
* this tree level (hlevel), and then at every
* iteration, look for the next smallest suffix
* match (add another subdomain label to the
* absolute name being hashed).
2008-01-22 23:28:04 +00:00
*/
dns_name_getlabelsequence(name, nlabels - tlabels,
2008-01-22 23:28:04 +00:00
hlabels + tlabels,
&hash_name);
hash = dns_name_fullhash(&hash_name, false);
2008-01-22 23:28:04 +00:00
dns_name_getlabelsequence(search_name,
nlabels - tlabels, tlabels,
&hash_name);
2008-01-22 23:28:04 +00:00
/*
* Walk all the nodes in the hash bucket pointed
* by the computed hash value.
*/
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
for (hnode = rbt->hashtable[hash_32(hash,
rbt->hashbits)];
2020-02-13 14:44:37 -08:00
hnode != NULL; hnode = hnode->hashnext)
{
2008-01-22 23:28:04 +00:00
dns_name_t hnode_name;
if (ISC_LIKELY(hash != HASHVAL(hnode))) {
2008-01-22 23:28:04 +00:00
continue;
}
/*
* This checks that the hashed label
* sequence being looked up is at the
* same tree level, so that we don't
* match a labelsequence from some other
* subdomain.
*/
if (ISC_LIKELY(get_upper_node(hnode) !=
up_current)) {
2008-01-22 23:28:04 +00:00
continue;
}
2008-01-22 23:28:04 +00:00
dns_name_init(&hnode_name, NULL);
NODENAME(hnode, &hnode_name);
if (ISC_LIKELY(dns_name_equal(&hnode_name,
&hash_name))) {
2008-01-22 23:28:04 +00:00
break;
}
2008-01-22 23:28:04 +00:00
}
if (hnode != NULL) {
current = hnode;
/*
* This is an optimization. If hashing found
* the right node, the next call to
* dns_name_fullcompare() would obviously
* return _equal or _subdomain. Determine
* which of those would be the case by
* checking if the full name was hashed. Then
* make it look like dns_name_fullcompare
* was called and jump to the right place.
*/
if (tlabels == nlabels) {
compared = dns_namereln_equal;
break;
} else {
common_labels = tlabels;
compared = dns_namereln_subdomain;
goto subdomain;
}
}
if (tlabels++ < nlabels) {
2008-01-22 23:28:04 +00:00
goto hashagain;
}
2008-01-22 23:28:04 +00:00
/*
* All of the labels have been tried against the hash
* table. Since we dropped the support of bitstring
* labels, the name isn't in the table.
*/
current = NULL;
continue;
} else {
/*
* The names have some common suffix labels.
*
* If the number in common are equal in length to
* the current node's name length, then follow the
* down pointer and search in the new tree.
*/
if (compared == dns_namereln_subdomain) {
subdomain:
2008-01-22 23:28:04 +00:00
/*
* Whack off the current node's common parts
* for the name to search in the next level.
*/
dns_name_split(search_name, common_labels,
search_name, NULL);
hlabels += common_labels;
/*
* This might be the closest enclosing name.
*/
Only test node->data if we care about whether data is present or not. WARNING: ThreadSanitizer: data race (pid=28788) Write of size 8 at 0x7b200002e060 by thread T1 (mutexes: write M2947): #0 add32 /builds/isc-projects/bind9/lib/dns/rbtdb.c:6638:18 (libdns.so.1110+0xe7843) #1 addrdataset /builds/isc-projects/bind9/lib/dns/rbtdb.c:6975:12 (libdns.so.1110+0xe4185) #2 dns_db_addrdataset /builds/isc-projects/bind9/lib/dns/db.c:783:10 (libdns.so.1110+0x650ee) #3 validated /builds/isc-projects/bind9/lib/dns/resolver.c:5140:11 (libdns.so.1110+0x1909f7) #4 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #5 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749) Previous read of size 8 at 0x7b200002e060 by thread T5 (mutexes: write M521146194917735760): #0 dns_rbt_findnode /builds/isc-projects/bind9/lib/dns/rbt.c:1708:9 (libdns.so.1110+0xd910d) #1 cache_find /builds/isc-projects/bind9/lib/dns/rbtdb.c:5098:11 (libdns.so.1110+0xe188e) #2 dns_db_find /builds/isc-projects/bind9/lib/dns/db.c:554:11 (libdns.so.1110+0x642bb) #3 dns_view_find2 /builds/isc-projects/bind9/lib/dns/view.c:1068:11 (libdns.so.1110+0x1cc2c4) #4 dbfind_name /builds/isc-projects/bind9/lib/dns/adb.c:3714:11 (libdns.so.1110+0x46a4b) #5 dns_adb_createfind2 /builds/isc-projects/bind9/lib/dns/adb.c:3133:12 (libdns.so.1110+0x45278) #6 findname /builds/isc-projects/bind9/lib/dns/resolver.c:3166:11 (libdns.so.1110+0x1827f0) #7 fctx_getaddresses /builds/isc-projects/bind9/lib/dns/resolver.c:3462:3 (libdns.so.1110+0x18032d) #8 fctx_try /builds/isc-projects/bind9/lib/dns/resolver.c:3819:12 (libdns.so.1110+0x17e174) #9 fctx_start /builds/isc-projects/bind9/lib/dns/resolver.c:4219:4 (libdns.so.1110+0x1787a3) #10 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #11 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749)
2020-08-26 16:24:13 +10:00
if (WANTEMPTYDATA_OR_DATA(options, current)) {
2008-01-22 23:28:04 +00:00
*node = current;
}
2008-01-22 23:28:04 +00:00
/*
* Point the chain to the next level. This
* needs to be done before 'current' is pointed
* there because the callback in the next
* block of code needs the current 'current',
* but in the event the callback requests that
* the search be stopped then the
* DNS_R_PARTIALMATCH code at the end of this
* function needs the chain pointed to the
* next level.
*/
ADD_LEVEL(chain, current);
/*
* The caller may want to interrupt the
* downward search when certain special nodes
* are traversed. If this is a special node,
* the callback is used to learn what the
* caller wants to do.
*/
if (callback != NULL && FINDCALLBACK(current)) {
result = chain_name(
chain, callback_name, false);
2008-01-22 23:28:04 +00:00
if (result != ISC_R_SUCCESS) {
dns_rbtnodechain_reset(chain);
return (result);
}
result = (callback)(current,
callback_name,
callback_arg);
if (result != DNS_R_CONTINUE) {
saved_result = result;
/*
* Treat this node as if it
* had no down pointer.
*/
current = NULL;
break;
}
}
/*
* Finally, head to the next tree level.
*/
current = DOWN(current);
} else {
/*
* Though there are labels in common, the
* entire name at this node is not common
* with the search name so the search
* name does not exist in the tree.
*/
INSIST(compared ==
dns_namereln_commonancestor ||
compared == dns_namereln_contains);
2008-01-22 23:28:04 +00:00
current = NULL;
}
}
}
/*
* If current is not NULL, NOEXACT is not disallowing exact matches,
* and either the node has data or an empty node is ok, return
* ISC_R_SUCCESS to indicate an exact match.
*/
if (current != NULL && (options & DNS_RBTFIND_NOEXACT) == 0 &&
Only test node->data if we care about whether data is present or not. WARNING: ThreadSanitizer: data race (pid=28788) Write of size 8 at 0x7b200002e060 by thread T1 (mutexes: write M2947): #0 add32 /builds/isc-projects/bind9/lib/dns/rbtdb.c:6638:18 (libdns.so.1110+0xe7843) #1 addrdataset /builds/isc-projects/bind9/lib/dns/rbtdb.c:6975:12 (libdns.so.1110+0xe4185) #2 dns_db_addrdataset /builds/isc-projects/bind9/lib/dns/db.c:783:10 (libdns.so.1110+0x650ee) #3 validated /builds/isc-projects/bind9/lib/dns/resolver.c:5140:11 (libdns.so.1110+0x1909f7) #4 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #5 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749) Previous read of size 8 at 0x7b200002e060 by thread T5 (mutexes: write M521146194917735760): #0 dns_rbt_findnode /builds/isc-projects/bind9/lib/dns/rbt.c:1708:9 (libdns.so.1110+0xd910d) #1 cache_find /builds/isc-projects/bind9/lib/dns/rbtdb.c:5098:11 (libdns.so.1110+0xe188e) #2 dns_db_find /builds/isc-projects/bind9/lib/dns/db.c:554:11 (libdns.so.1110+0x642bb) #3 dns_view_find2 /builds/isc-projects/bind9/lib/dns/view.c:1068:11 (libdns.so.1110+0x1cc2c4) #4 dbfind_name /builds/isc-projects/bind9/lib/dns/adb.c:3714:11 (libdns.so.1110+0x46a4b) #5 dns_adb_createfind2 /builds/isc-projects/bind9/lib/dns/adb.c:3133:12 (libdns.so.1110+0x45278) #6 findname /builds/isc-projects/bind9/lib/dns/resolver.c:3166:11 (libdns.so.1110+0x1827f0) #7 fctx_getaddresses /builds/isc-projects/bind9/lib/dns/resolver.c:3462:3 (libdns.so.1110+0x18032d) #8 fctx_try /builds/isc-projects/bind9/lib/dns/resolver.c:3819:12 (libdns.so.1110+0x17e174) #9 fctx_start /builds/isc-projects/bind9/lib/dns/resolver.c:4219:4 (libdns.so.1110+0x1787a3) #10 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #11 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749)
2020-08-26 16:24:13 +10:00
WANTEMPTYDATA_OR_DATA(options, current))
2020-02-13 14:44:37 -08:00
{
2008-01-22 23:28:04 +00:00
/*
* Found an exact match.
*/
chain->end = current;
chain->level_matches = chain->level_count;
if (foundname != NULL) {
result = chain_name(chain, foundname, true);
} else {
2008-01-22 23:28:04 +00:00
result = ISC_R_SUCCESS;
}
2008-01-22 23:28:04 +00:00
if (result == ISC_R_SUCCESS) {
*node = current;
result = saved_result;
} else {
2008-01-22 23:28:04 +00:00
*node = NULL;
}
2008-01-22 23:28:04 +00:00
} else {
/*
* Did not find an exact match (or did not want one).
*/
if (*node != NULL) {
/*
* ... but found a partially matching superdomain.
* Unwind the chain to the partial match node
* to set level_matches to the level above the node,
* and then to derive the name.
*
* chain->level_count is guaranteed to be at least 1
* here because by definition of finding a superdomain,
* the chain is pointed to at least the first subtree.
*/
chain->level_matches = chain->level_count - 1;
while (chain->levels[chain->level_matches] != *node) {
INSIST(chain->level_matches > 0);
chain->level_matches--;
}
if (foundname != NULL) {
unsigned int saved_count = chain->level_count;
chain->level_count = chain->level_matches + 1;
result = chain_name(chain, foundname, false);
2008-01-22 23:28:04 +00:00
chain->level_count = saved_count;
} else {
2008-01-22 23:28:04 +00:00
result = ISC_R_SUCCESS;
}
2008-01-22 23:28:04 +00:00
if (result == ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
result = DNS_R_PARTIALMATCH;
}
} else {
2008-01-22 23:28:04 +00:00
result = ISC_R_NOTFOUND;
}
2008-01-22 23:28:04 +00:00
if (current != NULL) {
/*
* There was an exact match but either
* DNS_RBTFIND_NOEXACT was set, or
* DNS_RBTFIND_EMPTYDATA was set and the node had no
* data. A policy decision was made to set the
* chain to the exact match, but this is subject
* to change if it becomes apparent that something
* else would be more useful. It is important that
* this case is handled here, because the predecessor
* setting code below assumes the match was not exact.
*/
INSIST(((options & DNS_RBTFIND_NOEXACT) != 0) ||
((options & DNS_RBTFIND_EMPTYDATA) == 0 &&
DATA(current) == NULL));
chain->end = current;
} else if ((options & DNS_RBTFIND_NOPREDECESSOR) != 0) {
/*
* Ensure the chain points nowhere.
*/
chain->end = NULL;
} else {
/*
* Since there was no exact match, the chain argument
* needs to be pointed at the DNSSEC predecessor of
* the search name.
*/
if (compared == dns_namereln_subdomain) {
/*
* Attempted to follow a down pointer that was
* NULL, which means the searched for name was
* a subdomain of a terminal name in the tree.
* Since there are no existing subdomains to
* order against, the terminal name is the
* predecessor.
*/
INSIST(chain->level_count > 0);
INSIST(chain->level_matches <
chain->level_count);
chain->end =
chain->levels[--chain->level_count];
} else {
isc_result_t result2;
/*
* Point current to the node that stopped
* the search.
*
* With the hashing modification that has been
* added to the algorithm, the stop node of a
* standard binary search is not known. So it
* has to be found. There is probably a more
* clever way of doing this.
*
* The assignment of current to NULL when
* the relationship is *not* dns_namereln_none,
* even though it later gets set to the same
* last_compared anyway, is simply to not push
* the while loop in one more level of
* indentation.
*/
if (compared == dns_namereln_none) {
2008-01-22 23:28:04 +00:00
current = last_compared;
} else {
2008-01-22 23:28:04 +00:00
current = NULL;
}
2008-01-22 23:28:04 +00:00
while (current != NULL) {
NODENAME(current, &current_name);
compared = dns_name_fullcompare(
search_name, &current_name,
&order, &common_labels);
POST(compared);
2008-01-22 23:28:04 +00:00
last_compared = current;
/*
* Standard binary search movement.
*/
if (order < 0) {
2008-01-22 23:28:04 +00:00
current = LEFT(current);
} else {
2008-01-22 23:28:04 +00:00
current = RIGHT(current);
}
2008-01-22 23:28:04 +00:00
}
current = last_compared;
/*
* Reached a point within a level tree that
* positively indicates the name is not
* present, but the stop node could be either
* less than the desired name (order > 0) or
* greater than the desired name (order < 0).
*
* If the stop node is less, it is not
* necessarily the predecessor. If the stop
* node has a down pointer, then the real
* predecessor is at the end of a level below
* (not necessarily the next level).
* Move down levels until the rightmost node
* does not have a down pointer.
*
* When the stop node is greater, it is
* the successor. All the logic for finding
* the predecessor is handily encapsulated
* in dns_rbtnodechain_prev. In the event
* that the search name is less than anything
* else in the tree, the chain is reset.
* XXX DCL What is the best way for the caller
* to know that the search name has
* no predecessor?
*/
if (order > 0) {
if (DOWN(current) != NULL) {
ADD_LEVEL(chain, current);
result2 = move_chain_to_last(
chain, DOWN(current));
2008-01-22 23:28:04 +00:00
if (result2 != ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
result = result2;
}
} else {
2008-01-22 23:28:04 +00:00
/*
* Ah, the pure and simple
* case. The stop node is the
* predecessor.
*/
chain->end = current;
}
2008-01-22 23:28:04 +00:00
} else {
INSIST(order < 0);
chain->end = current;
result2 = dns_rbtnodechain_prev(
chain, NULL, NULL);
2008-01-22 23:28:04 +00:00
if (result2 == ISC_R_SUCCESS ||
result2 == DNS_R_NEWORIGIN) {
/* Nothing. */
} else if (result2 == ISC_R_NOMORE) {
2008-01-22 23:28:04 +00:00
/*
* There is no predecessor.
*/
dns_rbtnodechain_reset(chain);
} else {
2008-01-22 23:28:04 +00:00
result = result2;
}
2008-01-22 23:28:04 +00:00
}
}
}
}
ENSURE(*node == NULL || DNS_RBTNODE_VALID(*node));
return (result);
}
/*
 * Look up 'name' in 'rbt' via dns_rbt_findnode() and hand back the data
 * pointer stored on the node it selects.
 *
 * '*data' must be NULL on entry.  It is set only when a node was returned
 * and WANTEMPTYDATA_OR_DATA() accepts that node for the given 'options';
 * in that case the result of dns_rbt_findnode() is passed through
 * unchanged.  In every other case '*data' is left untouched and
 * ISC_R_NOTFOUND is returned.
 */
isc_result_t
dns_rbt_findname(dns_rbt_t *rbt, const dns_name_t *name, unsigned int options,
		 dns_name_t *foundname, void **data) {
	dns_rbtnode_t *found = NULL;
	isc_result_t result;

	REQUIRE(data != NULL && *data == NULL);

	result = dns_rbt_findnode(rbt, name, foundname, &found, NULL, options,
				  NULL, NULL);

	if (found != NULL && WANTEMPTYDATA_OR_DATA(options, found)) {
		*data = DATA(found);
		return (result);
	}

	return (ISC_R_NOTFOUND);
}
/*
* Delete a name from the tree of trees.
*/
isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbt_deletename(dns_rbt_t *rbt, const dns_name_t *name, bool recurse) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *node = NULL;
2020-02-13 14:44:37 -08:00
isc_result_t result;
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
REQUIRE(dns_name_isabsolute(name));
/*
* First, find the node.
*
* When searching, the name might not have an exact match:
* consider a.b.a.com, b.b.a.com and c.b.a.com as the only
* elements of a tree, which would make layer 1 a single
* node tree of "b.a.com" and layer 2 a three node tree of
* a, b, and c. Deleting a.com would find only a partial depth
* match in the first layer. Should it be a requirement that
* that the name to be deleted have data? For now, it is.
*
* ->dirty, ->locknum and ->references are ignored; they are
* solely the province of rbtdb.c.
*/
result = dns_rbt_findnode(rbt, name, NULL, &node, NULL,
DNS_RBTFIND_NOOPTIONS, NULL, NULL);
if (result == ISC_R_SUCCESS) {
if (DATA(node) != NULL) {
2008-01-22 23:28:04 +00:00
result = dns_rbt_deletenode(rbt, node, recurse);
} else {
2008-01-22 23:28:04 +00:00
result = ISC_R_NOTFOUND;
}
} else if (result == DNS_R_PARTIALMATCH) {
2008-01-22 23:28:04 +00:00
result = ISC_R_NOTFOUND;
}
2008-01-22 23:28:04 +00:00
return (result);
}
/*
* Remove a node from the tree of trees.
*
* NOTE WELL: deletion is *not* symmetric with addition; that is, reversing
* a sequence of additions to be deletions will not generally get the
* tree back to the state it started in. For example, if the addition
* of "b.c" caused the node "a.b.c" to be split, pushing "a" to its own level,
* then the subsequent deletion of "b.c" will not cause "a" to be pulled up,
* restoring "a.b.c". The RBT *used* to do this kind of rejoining, but it
* turned out to be a bad idea because it could corrupt an active nodechain
* that had "b.c" as one of its levels -- and the RBT has no idea what
* nodechains are in use by callers, so it can't even *try* to helpfully
* fix them up (which would probably be doomed to failure anyway).
*
* Similarly, it is possible to leave the tree in a state where a supposedly
* deleted node still exists. The first case of this is obvious; take
* the tree which has "b.c" on one level, pointing to "a". Now deleted "b.c".
* It was just established in the previous paragraph why we can't pull "a"
* back up to its parent level. But what happens when "a" then gets deleted?
* "b.c" is left hanging around without data or children. This condition
* is actually pretty easy to detect, but ... should it really be removed?
* Is a chain pointing to it? An iterator? Who knows! (Note that the
* references structure member cannot be looked at because it is private to
* rbtdb.) This is ugly and makes me unhappy, but after hours of trying to
* make it more aesthetically proper and getting nowhere, this is the way it
* is going to stay until such time as it proves to be a *real* problem.
*
* Finally, for reference, note that the original routine that did node
* joining was called join_nodes(). It has been excised, living now only
* in the CVS history, but comments have been left behind that point to it just
* in case someone wants to muck with this some more.
*
* The one positive aspect of all of this is that joining used to have a
* case where it might fail. Without trying to join, now this function always
2001-11-09 01:53:20 +00:00
* succeeds. It still returns isc_result_t, though, so the API wouldn't change.
*/
isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbt_deletenode(dns_rbt_t *rbt, dns_rbtnode_t *node, bool recurse) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *parent;
REQUIRE(VALID_RBT(rbt));
REQUIRE(DNS_RBTNODE_VALID(node));
INSIST(rbt->nodecount != 0);
2008-01-22 23:28:04 +00:00
if (DOWN(node) != NULL) {
2015-12-09 19:07:20 +05:30
if (recurse) {
PARENT(DOWN(node)) = NULL;
deletetreeflat(rbt, 0, true, &DOWN(node));
2015-12-09 19:07:20 +05:30
} else {
if (DATA(node) != NULL && rbt->data_deleter != NULL) {
2008-01-22 23:28:04 +00:00
rbt->data_deleter(DATA(node), rbt->deleter_arg);
}
2008-01-22 23:28:04 +00:00
DATA(node) = NULL;
/*
* Since there is at least one node below this one and
* no recursion was requested, the deletion is
* complete. The down node from this node might be all
* by itself on a single level, so join_nodes() could
* be used to collapse the tree (with all the caveats
* of the comment at the start of this function).
2015-12-09 19:07:20 +05:30
* But join_nodes() function has now been removed.
2008-01-22 23:28:04 +00:00
*/
return (ISC_R_SUCCESS);
}
}
/*
* Note the node that points to the level of the node
* that is being deleted. If the deleted node is the
* top level, parent will be set to NULL.
*/
parent = get_upper_node(node);
2008-01-22 23:28:04 +00:00
/*
* This node now has no down pointer, so now it needs
* to be removed from this level.
*/
deletefromlevel(node, parent == NULL ? &rbt->root : &DOWN(parent));
2008-01-22 23:28:04 +00:00
if (DATA(node) != NULL && rbt->data_deleter != NULL) {
rbt->data_deleter(DATA(node), rbt->deleter_arg);
}
2008-01-22 23:28:04 +00:00
unhash_node(rbt, node);
#if DNS_RBT_USEMAGIC
node->magic = 0;
#endif /* if DNS_RBT_USEMAGIC */
isc_refcount_destroy(&node->references);
freenode(rbt, &node);
2008-01-22 23:28:04 +00:00
/*
* This function never fails.
*/
return (ISC_R_SUCCESS);
}
void
2020-02-13 14:44:37 -08:00
dns_rbt_namefromnode(dns_rbtnode_t *node, dns_name_t *name) {
2008-01-22 23:28:04 +00:00
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(name != NULL);
REQUIRE(name->offsets == NULL);
2008-01-22 23:28:04 +00:00
NODENAME(node, name);
}
isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbt_fullnamefromnode(dns_rbtnode_t *node, dns_name_t *name) {
dns_name_t current;
2008-01-22 23:28:04 +00:00
isc_result_t result;
2008-01-22 23:28:04 +00:00
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(name != NULL);
REQUIRE(name->buffer != NULL);
2008-01-22 23:28:04 +00:00
dns_name_init(&current, NULL);
dns_name_reset(name);
2008-01-22 23:28:04 +00:00
do {
INSIST(node != NULL);
2008-01-22 23:28:04 +00:00
NODENAME(node, &current);
2008-01-22 23:28:04 +00:00
result = dns_name_concatenate(name, &current, name, NULL);
if (result != ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
break;
}
node = get_upper_node(node);
} while (!dns_name_isabsolute(name));
2008-01-22 23:28:04 +00:00
return (result);
}
char *
2020-02-13 14:44:37 -08:00
dns_rbt_formatnodename(dns_rbtnode_t *node, char *printname,
unsigned int size) {
2008-01-22 23:28:04 +00:00
dns_fixedname_t fixedname;
2020-02-13 14:44:37 -08:00
dns_name_t *name;
isc_result_t result;
2008-01-22 23:28:04 +00:00
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(printname != NULL);
name = dns_fixedname_initname(&fixedname);
2008-01-22 23:28:04 +00:00
result = dns_rbt_fullnamefromnode(node, name);
if (result == ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
dns_name_format(name, printname, size);
} else {
2008-01-22 23:28:04 +00:00
snprintf(printname, size, "<error building name: %s>",
dns_result_totext(result));
}
2008-01-22 23:28:04 +00:00
return (printname);
}
static isc_result_t
2020-02-13 14:44:37 -08:00
create_node(isc_mem_t *mctx, const dns_name_t *name, dns_rbtnode_t **nodep) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *node;
2020-02-13 14:44:37 -08:00
isc_region_t region;
unsigned int labels;
size_t nodelen;
2008-01-22 23:28:04 +00:00
REQUIRE(name->offsets != NULL);
dns_name_toregion(name, &region);
labels = dns_name_countlabels(name);
ENSURE(labels > 0);
/*
* Allocate space for the node structure, the name, and the offsets.
*/
nodelen = sizeof(dns_rbtnode_t) + region.length + labels + 1;
node = isc_mem_get(mctx, nodelen);
memset(node, 0, nodelen);
2008-01-22 23:28:04 +00:00
node->is_root = 0;
PARENT(node) = NULL;
RIGHT(node) = NULL;
LEFT(node) = NULL;
DOWN(node) = NULL;
DATA(node) = NULL;
node->is_mmapped = 0;
node->down_is_relative = 0;
node->left_is_relative = 0;
node->right_is_relative = 0;
node->parent_is_relative = 0;
node->data_is_relative = 0;
node->rpz = 0;
2012-06-20 23:46:40 +00:00
2008-01-22 23:28:04 +00:00
HASHNEXT(node) = NULL;
HASHVAL(node) = 0;
2008-01-22 23:28:04 +00:00
ISC_LINK_INIT(node, deadlink);
LOCKNUM(node) = 0;
WILD(node) = 0;
DIRTY(node) = 0;
isc_refcount_init(&node->references, 0);
2008-01-22 23:28:04 +00:00
node->find_callback = 0;
node->nsec = DNS_RBT_NSEC_NORMAL;
2008-01-22 23:28:04 +00:00
MAKE_BLACK(node);
/*
* The following is stored to make reconstructing a name from the
* stored value in the node easy: the length of the name, the number
* of labels, whether the name is absolute or not, the name itself,
* and the name's offsets table.
*
* XXX RTH
* The offsets table could be made smaller by eliminating the
* first offset, which is always 0. This requires changes to
* lib/dns/name.c.
*
* Note: OLDOFFSETLEN *must* be assigned *after* OLDNAMELEN is assigned
* as it uses OLDNAMELEN.
2008-01-22 23:28:04 +00:00
*/
OLDNAMELEN(node) = NAMELEN(node) = region.length;
OLDOFFSETLEN(node) = OFFSETLEN(node) = labels;
2008-01-22 23:28:04 +00:00
ATTRS(node) = name->attributes;
memmove(NAME(node), region.base, region.length);
memmove(OFFSETS(node), name->offsets, labels);
#if DNS_RBT_USEMAGIC
2008-01-22 23:28:04 +00:00
node->magic = DNS_RBTNODE_MAGIC;
#endif /* if DNS_RBT_USEMAGIC */
2008-01-22 23:28:04 +00:00
*nodep = node;
2008-01-22 23:28:04 +00:00
return (ISC_R_SUCCESS);
}
/*
* Add a node to the hash table
*/
static inline void
2020-02-13 14:44:37 -08:00
hash_add_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
uint32_t hash;
REQUIRE(name != NULL);
HASHVAL(node) = dns_name_fullhash(name, false);
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
hash = hash_32(HASHVAL(node), rbt->hashbits);
2008-01-22 23:28:04 +00:00
HASHNEXT(node) = rbt->hashtable[hash];
2008-01-22 23:28:04 +00:00
rbt->hashtable[hash] = node;
}
/*
* Initialize hash table
*/
static isc_result_t
2020-02-13 14:44:37 -08:00
inithash(dns_rbt_t *rbt) {
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
size_t size;
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
rbt->hashbits = RBT_HASH_MIN_BITS;
size = HASHSIZE(rbt->hashbits) * sizeof(dns_rbtnode_t *);
rbt->hashtable = isc_mem_get(rbt->mctx, size);
memset(rbt->hashtable, 0, size);
2008-01-22 23:28:04 +00:00
return (ISC_R_SUCCESS);
}
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
/*
 * Work out how many hash bits the table should use to hold 'newcount'
 * entries: starting from the current rbt->hashbits, grow one bit at a
 * time while 'newcount' is at least HASHSIZE(bits), never going past
 * rbt->maxhashbits.  The table itself is not modified.
 */
static uint32_t
rehash_bits(dns_rbt_t *rbt, size_t newcount) {
	uint32_t bits;

	for (bits = rbt->hashbits; bits < rbt->maxhashbits; bits++) {
		if (newcount < HASHSIZE(bits)) {
			break;
		}
	}

	return (bits);
}
/*
* Rebuild the hashtable to reduce the load factor
*/
static void
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
rehash(dns_rbt_t *rbt, uint32_t newbits) {
uint32_t oldbits;
size_t oldsize;
2008-01-22 23:28:04 +00:00
dns_rbtnode_t **oldtable;
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
size_t newsize;
2008-01-22 23:28:04 +00:00
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
REQUIRE(rbt->hashbits <= rbt->maxhashbits);
REQUIRE(newbits <= rbt->maxhashbits);
oldbits = rbt->hashbits;
oldsize = HASHSIZE(oldbits);
2008-01-22 23:28:04 +00:00
oldtable = rbt->hashtable;
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
rbt->hashbits = newbits;
newsize = HASHSIZE(rbt->hashbits);
rbt->hashtable = isc_mem_get(rbt->mctx,
newsize * sizeof(dns_rbtnode_t *));
memset(rbt->hashtable, 0, newsize * sizeof(dns_rbtnode_t *));
2008-01-22 23:28:04 +00:00
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
for (size_t i = 0; i < oldsize; i++) {
dns_rbtnode_t *node;
dns_rbtnode_t *nextnode;
2015-12-09 19:07:20 +05:30
for (node = oldtable[i]; node != NULL; node = nextnode) {
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
uint32_t hash = hash_32(HASHVAL(node), rbt->hashbits);
2015-12-09 19:07:20 +05:30
nextnode = HASHNEXT(node);
2008-01-22 23:28:04 +00:00
HASHNEXT(node) = rbt->hashtable[hash];
rbt->hashtable[hash] = node;
}
}
isc_mem_put(rbt->mctx, oldtable, oldsize * sizeof(dns_rbtnode_t *));
}
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
/*
 * Grow the hash table of 'rbt' if the bit count that rehash_bits()
 * computes for 'newcount' is larger than the current one and does not
 * exceed rbt->maxhashbits.  Otherwise do nothing.
 */
static void
maybe_rehash(dns_rbt_t *rbt, size_t newcount) {
	uint32_t wanted = rehash_bits(rbt, newcount);

	/* Not a growth, or growth beyond the configured cap: leave as is. */
	if (wanted <= rbt->hashbits || wanted > rbt->maxhashbits) {
		return;
	}

	rehash(rbt, wanted);
}
/*
* Add a node to the hash table. Rehash the hashtable if the node count
* rises above a critical level.
*/
static inline void
2020-02-13 14:44:37 -08:00
hash_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
2008-01-22 23:28:04 +00:00
REQUIRE(DNS_RBTNODE_VALID(node));
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
if (rbt->nodecount >= (HASHSIZE(rbt->hashbits) * RBT_HASH_OVERCOMMIT)) {
maybe_rehash(rbt, rbt->nodecount);
}
2008-01-22 23:28:04 +00:00
hash_add_node(rbt, node, name);
}
/*
* Remove a node from the hash table
*/
static inline void
2020-02-13 14:44:37 -08:00
unhash_node(dns_rbt_t *rbt, dns_rbtnode_t *node) {
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
uint32_t bucket;
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *bucket_node;
REQUIRE(DNS_RBTNODE_VALID(node));
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. 
FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
bucket = hash_32(HASHVAL(node), rbt->hashbits);
2015-12-09 19:07:20 +05:30
bucket_node = rbt->hashtable[bucket];
2008-01-22 23:28:04 +00:00
2015-12-09 19:07:20 +05:30
if (bucket_node == node) {
rbt->hashtable[bucket] = HASHNEXT(node);
} else {
while (HASHNEXT(bucket_node) != node) {
INSIST(HASHNEXT(bucket_node) != NULL);
bucket_node = HASHNEXT(bucket_node);
2008-01-22 23:28:04 +00:00
}
2015-12-09 19:07:20 +05:30
HASHNEXT(bucket_node) = HASHNEXT(node);
2008-01-22 23:28:04 +00:00
}
}
static inline void
2020-02-13 14:44:37 -08:00
rotate_left(dns_rbtnode_t *node, dns_rbtnode_t **rootp) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *child;
2008-01-22 23:28:04 +00:00
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(rootp != NULL);
2008-01-22 23:28:04 +00:00
child = RIGHT(node);
INSIST(child != NULL);
2008-01-22 23:28:04 +00:00
RIGHT(node) = LEFT(child);
if (LEFT(child) != NULL) {
2008-01-22 23:28:04 +00:00
PARENT(LEFT(child)) = node;
}
2008-01-22 23:28:04 +00:00
LEFT(child) = node;
PARENT(child) = PARENT(node);
2008-01-22 23:28:04 +00:00
if (IS_ROOT(node)) {
*rootp = child;
child->is_root = 1;
node->is_root = 0;
} else {
if (LEFT(PARENT(node)) == node) {
2008-01-22 23:28:04 +00:00
LEFT(PARENT(node)) = child;
} else {
2008-01-22 23:28:04 +00:00
RIGHT(PARENT(node)) = child;
}
2008-01-22 23:28:04 +00:00
}
2008-01-22 23:28:04 +00:00
PARENT(node) = child;
}
static inline void
2020-02-13 14:44:37 -08:00
rotate_right(dns_rbtnode_t *node, dns_rbtnode_t **rootp) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *child;
2008-01-22 23:28:04 +00:00
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(rootp != NULL);
2008-01-22 23:28:04 +00:00
child = LEFT(node);
INSIST(child != NULL);
2008-01-22 23:28:04 +00:00
LEFT(node) = RIGHT(child);
if (RIGHT(child) != NULL) {
2008-01-22 23:28:04 +00:00
PARENT(RIGHT(child)) = node;
}
2008-01-22 23:28:04 +00:00
RIGHT(child) = node;
PARENT(child) = PARENT(node);
2008-01-22 23:28:04 +00:00
if (IS_ROOT(node)) {
*rootp = child;
child->is_root = 1;
node->is_root = 0;
} else {
if (LEFT(PARENT(node)) == node) {
2008-01-22 23:28:04 +00:00
LEFT(PARENT(node)) = child;
} else {
2008-01-22 23:28:04 +00:00
RIGHT(PARENT(node)) = child;
}
2008-01-22 23:28:04 +00:00
}
2008-01-22 23:28:04 +00:00
PARENT(node) = child;
}
/*
 * This is the real workhorse of the insertion code: it links 'node'
 * into a single level of the tree as a child of 'current' and then
 * restores the red/black properties of that level.
 *
 * 'order' selects the side: a negative value attaches 'node' as the
 * left child of 'current', any other value as the right child.  If
 * '*rootp' is NULL the level is empty, and 'node' simply becomes its
 * black root with 'current' recorded as its parent.
 *
 * Requires:
 *	'rootp' is not NULL.
 *	'node' is a valid node with neither a left nor a right child.
 *	'current' is not NULL.
 *
 * On return '*rootp' names the (possibly changed) root of the level,
 * which is colored black.
 */
static void
addonlevel(dns_rbtnode_t *node, dns_rbtnode_t *current, int order,
	   dns_rbtnode_t **rootp) {
	dns_rbtnode_t *root;

	REQUIRE(rootp != NULL);
	REQUIRE(DNS_RBTNODE_VALID(node) && LEFT(node) == NULL &&
		RIGHT(node) == NULL);
	REQUIRE(current != NULL);

	root = *rootp;
	if (root == NULL) {
		/*
		 * First node of a level.
		 */
		MAKE_BLACK(node);
		node->is_root = 1;
		PARENT(node) = current;
		*rootp = node;
		return;
	}

	/*
	 * Hang the new node off 'current' on the side chosen by 'order';
	 * that slot must still be empty.
	 */
	if (order < 0) {
		INSIST(LEFT(current) == NULL);
		LEFT(current) = node;
	} else {
		INSIST(RIGHT(current) == NULL);
		RIGHT(current) = node;
	}

	INSIST(PARENT(node) == NULL);
	PARENT(node) = current;

	MAKE_RED(node);

	/*
	 * Walk upwards for as long as a red node has a red parent.  The
	 * local 'root' is passed to the rotations so that a rotation at
	 * the top of the level is tracked and published once at the end.
	 */
	while (node != root && IS_RED(PARENT(node))) {
		dns_rbtnode_t *parent = PARENT(node);
		dns_rbtnode_t *grandparent = PARENT(parent);
		dns_rbtnode_t *uncle;

		if (parent == LEFT(grandparent)) {
			uncle = RIGHT(grandparent);
			if (uncle != NULL && IS_RED(uncle)) {
				/*
				 * Red uncle: recolor and continue from
				 * the grandparent.
				 */
				MAKE_BLACK(parent);
				MAKE_BLACK(uncle);
				MAKE_RED(grandparent);
				node = grandparent;
			} else {
				if (node == RIGHT(parent)) {
					/*
					 * Inner grandchild: rotate it to
					 * the outside first.
					 */
					rotate_left(parent, &root);
					node = parent;
					parent = PARENT(node);
					grandparent = PARENT(parent);
				}
				MAKE_BLACK(parent);
				MAKE_RED(grandparent);
				rotate_right(grandparent, &root);
			}
		} else {
			/*
			 * Parent is the right child of the grandparent;
			 * same as above with left and right exchanged.
			 */
			uncle = LEFT(grandparent);
			if (uncle != NULL && IS_RED(uncle)) {
				MAKE_BLACK(parent);
				MAKE_BLACK(uncle);
				MAKE_RED(grandparent);
				node = grandparent;
			} else {
				if (node == LEFT(parent)) {
					rotate_right(parent, &root);
					node = parent;
					parent = PARENT(node);
					grandparent = PARENT(parent);
				}
				MAKE_BLACK(parent);
				MAKE_RED(grandparent);
				rotate_left(grandparent, &root);
			}
		}
	}

	MAKE_BLACK(root);
	ENSURE(IS_ROOT(root));
	*rootp = root;
}
/*
* This is the real workhorse of the deletion code, because it does the
* true red/black tree on a single level.
*/
static void
2020-02-13 14:44:37 -08:00
deletefromlevel(dns_rbtnode_t *item, dns_rbtnode_t **rootp) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *child, *sibling, *parent;
dns_rbtnode_t *successor;
REQUIRE(item != NULL);
2008-01-22 23:28:04 +00:00
/*
* Verify that the parent history is (apparently) correct.
*/
INSIST((IS_ROOT(item) && *rootp == item) ||
(!IS_ROOT(item) &&
(LEFT(PARENT(item)) == item || RIGHT(PARENT(item)) == item)));
2008-01-22 23:28:04 +00:00
child = NULL;
if (LEFT(item) == NULL) {
if (RIGHT(item) == NULL) {
if (IS_ROOT(item)) {
2008-01-22 23:28:04 +00:00
/*
* This is the only item in the tree.
*/
*rootp = NULL;
return;
}
} else {
2008-01-22 23:28:04 +00:00
/*
* This node has one child, on the right.
*/
child = RIGHT(item);
}
} else if (RIGHT(item) == NULL) {
2008-01-22 23:28:04 +00:00
/*
* This node has one child, on the left.
*/
child = LEFT(item);
} else {
Remove the memmove call on dns_rbtnode_t structure that contains atomics Calling the plain memmove on the structure that contains atomic members triggers following TSAN warning (even when we don't really use the atomic members in the code): WARNING: ThreadSanitizer: data race Read of size 8 at 0x000000000001 by thread T1 (mutexes: write M1, write M2): #0 memmove <null> #1 memmove /usr/include/x86_64-linux-gnu/bits/string_fortified.h:40:10 #2 deletefromlevel lib/dns/rbt.c:2675:3 #3 dns_rbt_deletenode lib/dns/rbt.c:2143:2 #4 delete_node lib/dns/rbtdb.c #5 decrement_reference lib/dns/rbtdb.c:2202:4 #6 prune_tree lib/dns/rbtdb.c:2259:3 #7 dispatch lib/isc/task.c:1152:7 #8 run lib/isc/task.c:1344:2 Previous atomic write of size 8 at 0x000000000001 by thread T2 (mutexes: read M3): #0 __tsan_atomic64_fetch_sub <null> #1 decrement_reference lib/dns/rbtdb.c:2103:7 #2 detachnode lib/dns/rbtdb.c:5440:6 #3 dns_db_detachnode lib/dns/db.c:588:2 #4 qctx_clean lib/ns/query.c:5104:3 #5 ns_query_done lib/ns/query.c:10868:2 #6 query_sign_nodata lib/ns/query.c #7 query_nodata lib/ns/query.c:8438:11 #8 query_gotanswer lib/ns/query.c #9 query_lookup lib/ns/query.c:5624:10 #10 ns__query_start lib/ns/query.c:5500:10 #11 query_setup lib/ns/query.c:5224:11 #12 ns_query_start lib/ns/query.c:11357:8 #13 ns__client_request lib/ns/client.c:2166:3 #14 udp_recv_cb lib/isc/netmgr/udp.c:414:2 #15 uv__udp_recvmsg /home/ondrej/Projects/tsan/libuv/src/unix/udp.c #16 uv__udp_io /home/ondrej/Projects/tsan/libuv/src/unix/udp.c:180:5 #17 uv__io_poll /home/ondrej/Projects/tsan/libuv/src/unix/linux-core.c:461:11 #18 uv_run /home/ondrej/Projects/tsan/libuv/src/unix/core.c:385:5 #19 nm_thread lib/isc/netmgr/netmgr.c:500:11 Location is heap block of size 132 at 0x000000000030 allocated by thread T3: #0 malloc <null> #1 default_memalloc lib/isc/mem.c:713:8 #2 mem_get lib/isc/mem.c:622:8 #3 mem_allocateunlocked lib/isc/mem.c:1268:8 #4 isc___mem_allocate lib/isc/mem.c:1288:7 #5 isc__mem_allocate 
lib/isc/mem.c:2453:10 #6 isc___mem_get lib/isc/mem.c:1037:11 #7 isc__mem_get lib/isc/mem.c:2432:10 #8 create_node lib/dns/rbt.c:2239:9 #9 dns_rbt_addnode lib/dns/rbt.c:1435:12 #10 findnodeintree lib/dns/rbtdb.c:2895:12 #11 findnode lib/dns/rbtdb.c:2941:10 #12 dns_db_findnode lib/dns/db.c:439:11 #13 diff_apply lib/dns/diff.c:306:5 #14 dns_diff_apply lib/dns/diff.c:459:10 #15 do_one_tuple lib/ns/update.c:444:11 #16 update_one_rr lib/ns/update.c:495:10 #17 update_action lib/ns/update.c:3123:6 #18 dispatch lib/isc/task.c:1152:7 #19 run lib/isc/task.c:1344:2 Mutex M1 is already destroyed. Mutex M2 is already destroyed. Mutex M3 is already destroyed. Thread T1 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T2 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_nm_start lib/isc/netmgr/netmgr.c:223:3 #3 create_managers bin/named/main.c:909:15 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T3 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 SUMMARY: ThreadSanitizer: data race in memmove
2020-09-21 14:52:53 +10:00
dns_rbtnode_t *saved_parent, *saved_right;
int saved_color;
2008-01-22 23:28:04 +00:00
/*
* This node has two children, so it cannot be directly
* deleted. Find its immediate in-order successor and
* move it to this location, then do the deletion at the
* old site of the successor.
*/
successor = RIGHT(item);
while (LEFT(successor) != NULL) {
2008-01-22 23:28:04 +00:00
successor = LEFT(successor);
}
2008-01-22 23:28:04 +00:00
/*
* The successor cannot possibly have a left child;
* if there is any child, it is on the right.
*/
if (RIGHT(successor) != NULL) {
2008-01-22 23:28:04 +00:00
child = RIGHT(successor);
}
2008-01-22 23:28:04 +00:00
/*
* Swap the two nodes; it would be simpler to just replace
* the value being deleted with that of the successor,
* but this rigamarole is done so the caller has complete
* control over the pointers (and memory allocation) of
* all of nodes. If just the key value were removed from
* the tree, the pointer to the node would be unchanged.
*/
/*
* First, put the successor in the tree location of the
* node to be deleted. Save its existing tree pointer
* information, which will be needed when linking up
* delete to the successor's old location.
*/
Remove the memmove call on dns_rbtnode_t structure that contains atomics Calling the plain memmove on the structure that contains atomic members triggers following TSAN warning (even when we don't really use the atomic members in the code): WARNING: ThreadSanitizer: data race Read of size 8 at 0x000000000001 by thread T1 (mutexes: write M1, write M2): #0 memmove <null> #1 memmove /usr/include/x86_64-linux-gnu/bits/string_fortified.h:40:10 #2 deletefromlevel lib/dns/rbt.c:2675:3 #3 dns_rbt_deletenode lib/dns/rbt.c:2143:2 #4 delete_node lib/dns/rbtdb.c #5 decrement_reference lib/dns/rbtdb.c:2202:4 #6 prune_tree lib/dns/rbtdb.c:2259:3 #7 dispatch lib/isc/task.c:1152:7 #8 run lib/isc/task.c:1344:2 Previous atomic write of size 8 at 0x000000000001 by thread T2 (mutexes: read M3): #0 __tsan_atomic64_fetch_sub <null> #1 decrement_reference lib/dns/rbtdb.c:2103:7 #2 detachnode lib/dns/rbtdb.c:5440:6 #3 dns_db_detachnode lib/dns/db.c:588:2 #4 qctx_clean lib/ns/query.c:5104:3 #5 ns_query_done lib/ns/query.c:10868:2 #6 query_sign_nodata lib/ns/query.c #7 query_nodata lib/ns/query.c:8438:11 #8 query_gotanswer lib/ns/query.c #9 query_lookup lib/ns/query.c:5624:10 #10 ns__query_start lib/ns/query.c:5500:10 #11 query_setup lib/ns/query.c:5224:11 #12 ns_query_start lib/ns/query.c:11357:8 #13 ns__client_request lib/ns/client.c:2166:3 #14 udp_recv_cb lib/isc/netmgr/udp.c:414:2 #15 uv__udp_recvmsg /home/ondrej/Projects/tsan/libuv/src/unix/udp.c #16 uv__udp_io /home/ondrej/Projects/tsan/libuv/src/unix/udp.c:180:5 #17 uv__io_poll /home/ondrej/Projects/tsan/libuv/src/unix/linux-core.c:461:11 #18 uv_run /home/ondrej/Projects/tsan/libuv/src/unix/core.c:385:5 #19 nm_thread lib/isc/netmgr/netmgr.c:500:11 Location is heap block of size 132 at 0x000000000030 allocated by thread T3: #0 malloc <null> #1 default_memalloc lib/isc/mem.c:713:8 #2 mem_get lib/isc/mem.c:622:8 #3 mem_allocateunlocked lib/isc/mem.c:1268:8 #4 isc___mem_allocate lib/isc/mem.c:1288:7 #5 isc__mem_allocate 
lib/isc/mem.c:2453:10 #6 isc___mem_get lib/isc/mem.c:1037:11 #7 isc__mem_get lib/isc/mem.c:2432:10 #8 create_node lib/dns/rbt.c:2239:9 #9 dns_rbt_addnode lib/dns/rbt.c:1435:12 #10 findnodeintree lib/dns/rbtdb.c:2895:12 #11 findnode lib/dns/rbtdb.c:2941:10 #12 dns_db_findnode lib/dns/db.c:439:11 #13 diff_apply lib/dns/diff.c:306:5 #14 dns_diff_apply lib/dns/diff.c:459:10 #15 do_one_tuple lib/ns/update.c:444:11 #16 update_one_rr lib/ns/update.c:495:10 #17 update_action lib/ns/update.c:3123:6 #18 dispatch lib/isc/task.c:1152:7 #19 run lib/isc/task.c:1344:2 Mutex M1 is already destroyed. Mutex M2 is already destroyed. Mutex M3 is already destroyed. Thread T1 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T2 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_nm_start lib/isc/netmgr/netmgr.c:223:3 #3 create_managers bin/named/main.c:909:15 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T3 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 SUMMARY: ThreadSanitizer: data race in memmove
2020-09-21 14:52:53 +10:00
saved_parent = PARENT(successor);
saved_right = RIGHT(successor);
saved_color = COLOR(successor);
2008-01-22 23:28:04 +00:00
if (IS_ROOT(item)) {
2008-01-22 23:28:04 +00:00
*rootp = successor;
successor->is_root = true;
item->is_root = false;
} else if (LEFT(PARENT(item)) == item) {
LEFT(PARENT(item)) = successor;
} else {
RIGHT(PARENT(item)) = successor;
}
2008-01-22 23:28:04 +00:00
PARENT(successor) = PARENT(item);
LEFT(successor) = LEFT(item);
RIGHT(successor) = RIGHT(item);
COLOR(successor) = COLOR(item);
2008-01-22 23:28:04 +00:00
if (LEFT(successor) != NULL) {
2008-01-22 23:28:04 +00:00
PARENT(LEFT(successor)) = successor;
}
if (RIGHT(successor) != successor) {
2008-01-22 23:28:04 +00:00
PARENT(RIGHT(successor)) = successor;
}
2008-01-22 23:28:04 +00:00
/*
* Now relink the node to be deleted into the
Remove the memmove call on dns_rbtnode_t structure that contains atomics Calling the plain memmove on the structure that contains atomic members triggers following TSAN warning (even when we don't really use the atomic members in the code): WARNING: ThreadSanitizer: data race Read of size 8 at 0x000000000001 by thread T1 (mutexes: write M1, write M2): #0 memmove <null> #1 memmove /usr/include/x86_64-linux-gnu/bits/string_fortified.h:40:10 #2 deletefromlevel lib/dns/rbt.c:2675:3 #3 dns_rbt_deletenode lib/dns/rbt.c:2143:2 #4 delete_node lib/dns/rbtdb.c #5 decrement_reference lib/dns/rbtdb.c:2202:4 #6 prune_tree lib/dns/rbtdb.c:2259:3 #7 dispatch lib/isc/task.c:1152:7 #8 run lib/isc/task.c:1344:2 Previous atomic write of size 8 at 0x000000000001 by thread T2 (mutexes: read M3): #0 __tsan_atomic64_fetch_sub <null> #1 decrement_reference lib/dns/rbtdb.c:2103:7 #2 detachnode lib/dns/rbtdb.c:5440:6 #3 dns_db_detachnode lib/dns/db.c:588:2 #4 qctx_clean lib/ns/query.c:5104:3 #5 ns_query_done lib/ns/query.c:10868:2 #6 query_sign_nodata lib/ns/query.c #7 query_nodata lib/ns/query.c:8438:11 #8 query_gotanswer lib/ns/query.c #9 query_lookup lib/ns/query.c:5624:10 #10 ns__query_start lib/ns/query.c:5500:10 #11 query_setup lib/ns/query.c:5224:11 #12 ns_query_start lib/ns/query.c:11357:8 #13 ns__client_request lib/ns/client.c:2166:3 #14 udp_recv_cb lib/isc/netmgr/udp.c:414:2 #15 uv__udp_recvmsg /home/ondrej/Projects/tsan/libuv/src/unix/udp.c #16 uv__udp_io /home/ondrej/Projects/tsan/libuv/src/unix/udp.c:180:5 #17 uv__io_poll /home/ondrej/Projects/tsan/libuv/src/unix/linux-core.c:461:11 #18 uv_run /home/ondrej/Projects/tsan/libuv/src/unix/core.c:385:5 #19 nm_thread lib/isc/netmgr/netmgr.c:500:11 Location is heap block of size 132 at 0x000000000030 allocated by thread T3: #0 malloc <null> #1 default_memalloc lib/isc/mem.c:713:8 #2 mem_get lib/isc/mem.c:622:8 #3 mem_allocateunlocked lib/isc/mem.c:1268:8 #4 isc___mem_allocate lib/isc/mem.c:1288:7 #5 isc__mem_allocate 
lib/isc/mem.c:2453:10 #6 isc___mem_get lib/isc/mem.c:1037:11 #7 isc__mem_get lib/isc/mem.c:2432:10 #8 create_node lib/dns/rbt.c:2239:9 #9 dns_rbt_addnode lib/dns/rbt.c:1435:12 #10 findnodeintree lib/dns/rbtdb.c:2895:12 #11 findnode lib/dns/rbtdb.c:2941:10 #12 dns_db_findnode lib/dns/db.c:439:11 #13 diff_apply lib/dns/diff.c:306:5 #14 dns_diff_apply lib/dns/diff.c:459:10 #15 do_one_tuple lib/ns/update.c:444:11 #16 update_one_rr lib/ns/update.c:495:10 #17 update_action lib/ns/update.c:3123:6 #18 dispatch lib/isc/task.c:1152:7 #19 run lib/isc/task.c:1344:2 Mutex M1 is already destroyed. Mutex M2 is already destroyed. Mutex M3 is already destroyed. Thread T1 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T2 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_nm_start lib/isc/netmgr/netmgr.c:223:3 #3 create_managers bin/named/main.c:909:15 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T3 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 SUMMARY: ThreadSanitizer: data race in memmove
2020-09-21 14:52:53 +10:00
* successor's previous tree location.
2008-01-22 23:28:04 +00:00
*/
INSIST(!IS_ROOT(item));
2008-01-22 23:28:04 +00:00
Remove the memmove call on dns_rbtnode_t structure that contains atomics Calling the plain memmove on the structure that contains atomic members triggers following TSAN warning (even when we don't really use the atomic members in the code): WARNING: ThreadSanitizer: data race Read of size 8 at 0x000000000001 by thread T1 (mutexes: write M1, write M2): #0 memmove <null> #1 memmove /usr/include/x86_64-linux-gnu/bits/string_fortified.h:40:10 #2 deletefromlevel lib/dns/rbt.c:2675:3 #3 dns_rbt_deletenode lib/dns/rbt.c:2143:2 #4 delete_node lib/dns/rbtdb.c #5 decrement_reference lib/dns/rbtdb.c:2202:4 #6 prune_tree lib/dns/rbtdb.c:2259:3 #7 dispatch lib/isc/task.c:1152:7 #8 run lib/isc/task.c:1344:2 Previous atomic write of size 8 at 0x000000000001 by thread T2 (mutexes: read M3): #0 __tsan_atomic64_fetch_sub <null> #1 decrement_reference lib/dns/rbtdb.c:2103:7 #2 detachnode lib/dns/rbtdb.c:5440:6 #3 dns_db_detachnode lib/dns/db.c:588:2 #4 qctx_clean lib/ns/query.c:5104:3 #5 ns_query_done lib/ns/query.c:10868:2 #6 query_sign_nodata lib/ns/query.c #7 query_nodata lib/ns/query.c:8438:11 #8 query_gotanswer lib/ns/query.c #9 query_lookup lib/ns/query.c:5624:10 #10 ns__query_start lib/ns/query.c:5500:10 #11 query_setup lib/ns/query.c:5224:11 #12 ns_query_start lib/ns/query.c:11357:8 #13 ns__client_request lib/ns/client.c:2166:3 #14 udp_recv_cb lib/isc/netmgr/udp.c:414:2 #15 uv__udp_recvmsg /home/ondrej/Projects/tsan/libuv/src/unix/udp.c #16 uv__udp_io /home/ondrej/Projects/tsan/libuv/src/unix/udp.c:180:5 #17 uv__io_poll /home/ondrej/Projects/tsan/libuv/src/unix/linux-core.c:461:11 #18 uv_run /home/ondrej/Projects/tsan/libuv/src/unix/core.c:385:5 #19 nm_thread lib/isc/netmgr/netmgr.c:500:11 Location is heap block of size 132 at 0x000000000030 allocated by thread T3: #0 malloc <null> #1 default_memalloc lib/isc/mem.c:713:8 #2 mem_get lib/isc/mem.c:622:8 #3 mem_allocateunlocked lib/isc/mem.c:1268:8 #4 isc___mem_allocate lib/isc/mem.c:1288:7 #5 isc__mem_allocate 
lib/isc/mem.c:2453:10 #6 isc___mem_get lib/isc/mem.c:1037:11 #7 isc__mem_get lib/isc/mem.c:2432:10 #8 create_node lib/dns/rbt.c:2239:9 #9 dns_rbt_addnode lib/dns/rbt.c:1435:12 #10 findnodeintree lib/dns/rbtdb.c:2895:12 #11 findnode lib/dns/rbtdb.c:2941:10 #12 dns_db_findnode lib/dns/db.c:439:11 #13 diff_apply lib/dns/diff.c:306:5 #14 dns_diff_apply lib/dns/diff.c:459:10 #15 do_one_tuple lib/ns/update.c:444:11 #16 update_one_rr lib/ns/update.c:495:10 #17 update_action lib/ns/update.c:3123:6 #18 dispatch lib/isc/task.c:1152:7 #19 run lib/isc/task.c:1344:2 Mutex M1 is already destroyed. Mutex M2 is already destroyed. Mutex M3 is already destroyed. Thread T1 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T2 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_nm_start lib/isc/netmgr/netmgr.c:223:3 #3 create_managers bin/named/main.c:909:15 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T3 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 SUMMARY: ThreadSanitizer: data race in memmove
2020-09-21 14:52:53 +10:00
if (saved_parent == item) {
2008-01-22 23:28:04 +00:00
/*
* Node being deleted was successor's parent.
*/
RIGHT(successor) = item;
PARENT(item) = successor;
2008-01-22 23:28:04 +00:00
} else {
Remove the memmove call on dns_rbtnode_t structure that contains atomics Calling the plain memmove on the structure that contains atomic members triggers following TSAN warning (even when we don't really use the atomic members in the code): WARNING: ThreadSanitizer: data race Read of size 8 at 0x000000000001 by thread T1 (mutexes: write M1, write M2): #0 memmove <null> #1 memmove /usr/include/x86_64-linux-gnu/bits/string_fortified.h:40:10 #2 deletefromlevel lib/dns/rbt.c:2675:3 #3 dns_rbt_deletenode lib/dns/rbt.c:2143:2 #4 delete_node lib/dns/rbtdb.c #5 decrement_reference lib/dns/rbtdb.c:2202:4 #6 prune_tree lib/dns/rbtdb.c:2259:3 #7 dispatch lib/isc/task.c:1152:7 #8 run lib/isc/task.c:1344:2 Previous atomic write of size 8 at 0x000000000001 by thread T2 (mutexes: read M3): #0 __tsan_atomic64_fetch_sub <null> #1 decrement_reference lib/dns/rbtdb.c:2103:7 #2 detachnode lib/dns/rbtdb.c:5440:6 #3 dns_db_detachnode lib/dns/db.c:588:2 #4 qctx_clean lib/ns/query.c:5104:3 #5 ns_query_done lib/ns/query.c:10868:2 #6 query_sign_nodata lib/ns/query.c #7 query_nodata lib/ns/query.c:8438:11 #8 query_gotanswer lib/ns/query.c #9 query_lookup lib/ns/query.c:5624:10 #10 ns__query_start lib/ns/query.c:5500:10 #11 query_setup lib/ns/query.c:5224:11 #12 ns_query_start lib/ns/query.c:11357:8 #13 ns__client_request lib/ns/client.c:2166:3 #14 udp_recv_cb lib/isc/netmgr/udp.c:414:2 #15 uv__udp_recvmsg /home/ondrej/Projects/tsan/libuv/src/unix/udp.c #16 uv__udp_io /home/ondrej/Projects/tsan/libuv/src/unix/udp.c:180:5 #17 uv__io_poll /home/ondrej/Projects/tsan/libuv/src/unix/linux-core.c:461:11 #18 uv_run /home/ondrej/Projects/tsan/libuv/src/unix/core.c:385:5 #19 nm_thread lib/isc/netmgr/netmgr.c:500:11 Location is heap block of size 132 at 0x000000000030 allocated by thread T3: #0 malloc <null> #1 default_memalloc lib/isc/mem.c:713:8 #2 mem_get lib/isc/mem.c:622:8 #3 mem_allocateunlocked lib/isc/mem.c:1268:8 #4 isc___mem_allocate lib/isc/mem.c:1288:7 #5 isc__mem_allocate 
lib/isc/mem.c:2453:10 #6 isc___mem_get lib/isc/mem.c:1037:11 #7 isc__mem_get lib/isc/mem.c:2432:10 #8 create_node lib/dns/rbt.c:2239:9 #9 dns_rbt_addnode lib/dns/rbt.c:1435:12 #10 findnodeintree lib/dns/rbtdb.c:2895:12 #11 findnode lib/dns/rbtdb.c:2941:10 #12 dns_db_findnode lib/dns/db.c:439:11 #13 diff_apply lib/dns/diff.c:306:5 #14 dns_diff_apply lib/dns/diff.c:459:10 #15 do_one_tuple lib/ns/update.c:444:11 #16 update_one_rr lib/ns/update.c:495:10 #17 update_action lib/ns/update.c:3123:6 #18 dispatch lib/isc/task.c:1152:7 #19 run lib/isc/task.c:1344:2 Mutex M1 is already destroyed. Mutex M2 is already destroyed. Mutex M3 is already destroyed. Thread T1 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T2 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_nm_start lib/isc/netmgr/netmgr.c:223:3 #3 create_managers bin/named/main.c:909:15 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T3 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 SUMMARY: ThreadSanitizer: data race in memmove
2020-09-21 14:52:53 +10:00
LEFT(saved_parent) = item;
PARENT(item) = saved_parent;
2008-01-22 23:28:04 +00:00
}
/*
* Original location of successor node has no left.
*/
LEFT(item) = NULL;
Remove the memmove call on dns_rbtnode_t structure that contains atomics Calling the plain memmove on the structure that contains atomic members triggers following TSAN warning (even when we don't really use the atomic members in the code): WARNING: ThreadSanitizer: data race Read of size 8 at 0x000000000001 by thread T1 (mutexes: write M1, write M2): #0 memmove <null> #1 memmove /usr/include/x86_64-linux-gnu/bits/string_fortified.h:40:10 #2 deletefromlevel lib/dns/rbt.c:2675:3 #3 dns_rbt_deletenode lib/dns/rbt.c:2143:2 #4 delete_node lib/dns/rbtdb.c #5 decrement_reference lib/dns/rbtdb.c:2202:4 #6 prune_tree lib/dns/rbtdb.c:2259:3 #7 dispatch lib/isc/task.c:1152:7 #8 run lib/isc/task.c:1344:2 Previous atomic write of size 8 at 0x000000000001 by thread T2 (mutexes: read M3): #0 __tsan_atomic64_fetch_sub <null> #1 decrement_reference lib/dns/rbtdb.c:2103:7 #2 detachnode lib/dns/rbtdb.c:5440:6 #3 dns_db_detachnode lib/dns/db.c:588:2 #4 qctx_clean lib/ns/query.c:5104:3 #5 ns_query_done lib/ns/query.c:10868:2 #6 query_sign_nodata lib/ns/query.c #7 query_nodata lib/ns/query.c:8438:11 #8 query_gotanswer lib/ns/query.c #9 query_lookup lib/ns/query.c:5624:10 #10 ns__query_start lib/ns/query.c:5500:10 #11 query_setup lib/ns/query.c:5224:11 #12 ns_query_start lib/ns/query.c:11357:8 #13 ns__client_request lib/ns/client.c:2166:3 #14 udp_recv_cb lib/isc/netmgr/udp.c:414:2 #15 uv__udp_recvmsg /home/ondrej/Projects/tsan/libuv/src/unix/udp.c #16 uv__udp_io /home/ondrej/Projects/tsan/libuv/src/unix/udp.c:180:5 #17 uv__io_poll /home/ondrej/Projects/tsan/libuv/src/unix/linux-core.c:461:11 #18 uv_run /home/ondrej/Projects/tsan/libuv/src/unix/core.c:385:5 #19 nm_thread lib/isc/netmgr/netmgr.c:500:11 Location is heap block of size 132 at 0x000000000030 allocated by thread T3: #0 malloc <null> #1 default_memalloc lib/isc/mem.c:713:8 #2 mem_get lib/isc/mem.c:622:8 #3 mem_allocateunlocked lib/isc/mem.c:1268:8 #4 isc___mem_allocate lib/isc/mem.c:1288:7 #5 isc__mem_allocate 
lib/isc/mem.c:2453:10 #6 isc___mem_get lib/isc/mem.c:1037:11 #7 isc__mem_get lib/isc/mem.c:2432:10 #8 create_node lib/dns/rbt.c:2239:9 #9 dns_rbt_addnode lib/dns/rbt.c:1435:12 #10 findnodeintree lib/dns/rbtdb.c:2895:12 #11 findnode lib/dns/rbtdb.c:2941:10 #12 dns_db_findnode lib/dns/db.c:439:11 #13 diff_apply lib/dns/diff.c:306:5 #14 dns_diff_apply lib/dns/diff.c:459:10 #15 do_one_tuple lib/ns/update.c:444:11 #16 update_one_rr lib/ns/update.c:495:10 #17 update_action lib/ns/update.c:3123:6 #18 dispatch lib/isc/task.c:1152:7 #19 run lib/isc/task.c:1344:2 Mutex M1 is already destroyed. Mutex M2 is already destroyed. Mutex M3 is already destroyed. Thread T1 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T2 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_nm_start lib/isc/netmgr/netmgr.c:223:3 #3 create_managers bin/named/main.c:909:15 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 Thread T3 (running) created by main thread at: #0 pthread_create <null> #1 isc_thread_create lib/isc/pthreads/thread.c:73:8 #2 isc_taskmgr_create lib/isc/task.c:1434:3 #3 create_managers bin/named/main.c:915:11 #4 setup bin/named/main.c:1223:11 #5 main bin/named/main.c:1523:2 SUMMARY: ThreadSanitizer: data race in memmove
2020-09-21 14:52:53 +10:00
RIGHT(item) = saved_right;
COLOR(item) = saved_color;
2008-01-22 23:28:04 +00:00
}
/*
* Remove the node by removing the links from its parent.
*/
if (!IS_ROOT(item)) {
if (LEFT(PARENT(item)) == item) {
LEFT(PARENT(item)) = child;
} else {
RIGHT(PARENT(item)) = child;
}
2008-01-22 23:28:04 +00:00
if (child != NULL) {
PARENT(child) = PARENT(item);
}
2008-01-22 23:28:04 +00:00
} else {
/*
* This is the root being deleted, and at this point
* it is known to have just one child.
*/
*rootp = child;
child->is_root = 1;
PARENT(child) = PARENT(item);
2008-01-22 23:28:04 +00:00
}
/*
* Fix color violations.
*/
if (IS_BLACK(item)) {
/* cppcheck-suppress nullPointerRedundantCheck symbolName=item
*/
parent = PARENT(item);
2008-01-22 23:28:04 +00:00
while (child != *rootp && IS_BLACK(child)) {
INSIST(child == NULL || !IS_ROOT(child));
2008-01-22 23:28:04 +00:00
if (LEFT(parent) == child) {
sibling = RIGHT(parent);
if (IS_RED(sibling)) {
MAKE_BLACK(sibling);
MAKE_RED(parent);
rotate_left(parent, rootp);
sibling = RIGHT(parent);
}
INSIST(sibling != NULL);
/* cppcheck-suppress nullPointerRedundantCheck
* symbolName=sibling */
2008-01-22 23:28:04 +00:00
if (IS_BLACK(LEFT(sibling)) &&
IS_BLACK(RIGHT(sibling))) {
MAKE_RED(sibling);
child = parent;
} else {
if (IS_BLACK(RIGHT(sibling))) {
MAKE_BLACK(LEFT(sibling));
MAKE_RED(sibling);
rotate_right(sibling, rootp);
sibling = RIGHT(parent);
}
COLOR(sibling) = COLOR(parent);
MAKE_BLACK(parent);
2012-11-30 18:50:38 +11:00
INSIST(RIGHT(sibling) != NULL);
2008-01-22 23:28:04 +00:00
MAKE_BLACK(RIGHT(sibling));
rotate_left(parent, rootp);
child = *rootp;
}
} else {
/*
* Child is parent's right child.
2009-01-17 14:45:17 +00:00
* Everything is done the same as above,
2008-01-22 23:28:04 +00:00
* except mirrored.
*/
sibling = LEFT(parent);
if (IS_RED(sibling)) {
MAKE_BLACK(sibling);
MAKE_RED(parent);
rotate_right(parent, rootp);
sibling = LEFT(parent);
}
INSIST(sibling != NULL);
/* cppcheck-suppress nullPointerRedundantCheck
* symbolName=sibling */
2008-01-22 23:28:04 +00:00
if (IS_BLACK(LEFT(sibling)) &&
IS_BLACK(RIGHT(sibling))) {
MAKE_RED(sibling);
child = parent;
} else {
if (IS_BLACK(LEFT(sibling))) {
MAKE_BLACK(RIGHT(sibling));
MAKE_RED(sibling);
rotate_left(sibling, rootp);
sibling = LEFT(parent);
}
COLOR(sibling) = COLOR(parent);
MAKE_BLACK(parent);
2012-11-30 18:50:38 +11:00
INSIST(LEFT(sibling) != NULL);
2008-01-22 23:28:04 +00:00
MAKE_BLACK(LEFT(sibling));
rotate_right(parent, rootp);
child = *rootp;
}
}
parent = PARENT(child);
}
if (IS_RED(child)) {
2008-01-22 23:28:04 +00:00
MAKE_BLACK(child);
}
2008-01-22 23:28:04 +00:00
}
}
static void
2020-02-13 14:44:37 -08:00
freenode(dns_rbt_t *rbt, dns_rbtnode_t **nodep) {
dns_rbtnode_t *node = *nodep;
*nodep = NULL;
2012-06-20 23:46:40 +00:00
if (node->is_mmapped == 0) {
isc_mem_put(rbt->mctx, node, NODE_SIZE(node));
}
2012-06-20 23:46:40 +00:00
rbt->nodecount--;
}
static void
deletetreeflat(dns_rbt_t *rbt, unsigned int quantum, bool unhash,
2020-02-13 14:44:37 -08:00
dns_rbtnode_t **nodep) {
2015-12-09 19:07:20 +05:30
dns_rbtnode_t *root = *nodep;
2008-01-22 23:28:04 +00:00
2015-12-09 19:07:20 +05:30
while (root != NULL) {
/*
* If there is a left, right or down node, walk into it
* and iterate.
*/
if (LEFT(root) != NULL) {
dns_rbtnode_t *node = root;
root = LEFT(root);
LEFT(node) = NULL;
} else if (RIGHT(root) != NULL) {
dns_rbtnode_t *node = root;
root = RIGHT(root);
RIGHT(node) = NULL;
} else if (DOWN(root) != NULL) {
dns_rbtnode_t *node = root;
root = DOWN(root);
DOWN(node) = NULL;
} else {
/*
* There are no left, right or down nodes, so we
* can free this one and go back to its parent.
*/
dns_rbtnode_t *node = root;
root = PARENT(root);
2008-01-22 23:28:04 +00:00
Only test node->data if we care about whether data is present or not. WARNING: ThreadSanitizer: data race (pid=28788) Write of size 8 at 0x7b200002e060 by thread T1 (mutexes: write M2947): #0 add32 /builds/isc-projects/bind9/lib/dns/rbtdb.c:6638:18 (libdns.so.1110+0xe7843) #1 addrdataset /builds/isc-projects/bind9/lib/dns/rbtdb.c:6975:12 (libdns.so.1110+0xe4185) #2 dns_db_addrdataset /builds/isc-projects/bind9/lib/dns/db.c:783:10 (libdns.so.1110+0x650ee) #3 validated /builds/isc-projects/bind9/lib/dns/resolver.c:5140:11 (libdns.so.1110+0x1909f7) #4 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #5 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749) Previous read of size 8 at 0x7b200002e060 by thread T5 (mutexes: write M521146194917735760): #0 dns_rbt_findnode /builds/isc-projects/bind9/lib/dns/rbt.c:1708:9 (libdns.so.1110+0xd910d) #1 cache_find /builds/isc-projects/bind9/lib/dns/rbtdb.c:5098:11 (libdns.so.1110+0xe188e) #2 dns_db_find /builds/isc-projects/bind9/lib/dns/db.c:554:11 (libdns.so.1110+0x642bb) #3 dns_view_find2 /builds/isc-projects/bind9/lib/dns/view.c:1068:11 (libdns.so.1110+0x1cc2c4) #4 dbfind_name /builds/isc-projects/bind9/lib/dns/adb.c:3714:11 (libdns.so.1110+0x46a4b) #5 dns_adb_createfind2 /builds/isc-projects/bind9/lib/dns/adb.c:3133:12 (libdns.so.1110+0x45278) #6 findname /builds/isc-projects/bind9/lib/dns/resolver.c:3166:11 (libdns.so.1110+0x1827f0) #7 fctx_getaddresses /builds/isc-projects/bind9/lib/dns/resolver.c:3462:3 (libdns.so.1110+0x18032d) #8 fctx_try /builds/isc-projects/bind9/lib/dns/resolver.c:3819:12 (libdns.so.1110+0x17e174) #9 fctx_start /builds/isc-projects/bind9/lib/dns/resolver.c:4219:4 (libdns.so.1110+0x1787a3) #10 dispatch /builds/isc-projects/bind9/lib/isc/task.c:1157:7 (libisc.so.1107+0x507f5) #11 run /builds/isc-projects/bind9/lib/isc/task.c:1331:2 (libisc.so.1107+0x4d749)
2020-08-26 16:24:13 +10:00
if (rbt->data_deleter != NULL && DATA(node) != NULL) {
rbt->data_deleter(DATA(node), rbt->deleter_arg);
}
if (unhash) {
2015-12-09 19:07:20 +05:30
unhash_node(rbt, node);
}
/*
* Note: we don't call unhash_node() here as we
* are destroying the complete RBT tree.
*/
#if DNS_RBT_USEMAGIC
2015-12-09 19:07:20 +05:30
node->magic = 0;
#endif /* if DNS_RBT_USEMAGIC */
2015-12-09 19:07:20 +05:30
freenode(rbt, &node);
if (quantum != 0 && --quantum == 0) {
2015-12-09 19:07:20 +05:30
break;
}
2015-12-09 19:07:20 +05:30
}
2008-01-22 23:28:04 +00:00
}
2015-12-09 19:07:20 +05:30
*nodep = root;
}
static size_t
2020-02-13 14:44:37 -08:00
getheight_helper(dns_rbtnode_t *node) {
size_t dl, dr;
size_t this_height, down_height;
if (node == NULL) {
return (0);
}
dl = getheight_helper(LEFT(node));
dr = getheight_helper(RIGHT(node));
this_height = ISC_MAX(dl + 1, dr + 1);
down_height = getheight_helper(DOWN(node));
return (ISC_MAX(this_height, down_height));
}
size_t
2020-02-13 14:44:37 -08:00
dns__rbt_getheight(dns_rbt_t *rbt) {
return (getheight_helper(rbt->root));
}
static bool
2020-02-13 14:44:37 -08:00
check_properties_helper(dns_rbtnode_t *node) {
if (node == NULL) {
return (true);
}
if (IS_RED(node)) {
/* Root nodes must be BLACK. */
if (IS_ROOT(node)) {
return (false);
}
/* Both children of RED nodes must be BLACK. */
if (IS_RED(LEFT(node)) || IS_RED(RIGHT(node))) {
return (false);
}
}
/* cppcheck-suppress nullPointerRedundantCheck symbolName=node */
if ((DOWN(node) != NULL) && (!IS_ROOT(DOWN(node)))) {
return (false);
}
if (IS_ROOT(node)) {
if ((PARENT(node) != NULL) && (DOWN(PARENT(node)) != node)) {
return (false);
}
if (get_upper_node(node) != PARENT(node)) {
return (false);
}
}
/* If node is assigned to the down_ pointer of its parent, it is
* a subtree root and must have the flag set.
*/
if (((!PARENT(node)) || (DOWN(PARENT(node)) == node)) &&
(!IS_ROOT(node))) {
return (false);
}
/* Repeat tests with this node's children. */
return (check_properties_helper(LEFT(node)) &&
check_properties_helper(RIGHT(node)) &&
check_properties_helper(DOWN(node)));
}
static bool
2020-02-13 14:44:37 -08:00
check_black_distance_helper(dns_rbtnode_t *node, size_t *distance) {
size_t dl, dr, dd;
if (node == NULL) {
*distance = 1;
return (true);
}
/* cppcheck-suppress nullPointerRedundantCheck symbolName=node */
if (!check_black_distance_helper(LEFT(node), &dl)) {
return (false);
}
/* cppcheck-suppress nullPointerRedundantCheck symbolName=node */
if (!check_black_distance_helper(RIGHT(node), &dr)) {
return (false);
}
/* cppcheck-suppress nullPointerRedundantCheck symbolName=node */
if (!check_black_distance_helper(DOWN(node), &dd)) {
return (false);
}
/* Left and right side black node counts must match. */
if (dl != dr) {
return (false);
}
if (IS_BLACK(node)) {
dl++;
}
*distance = dl;
return (true);
}
bool
2020-02-13 14:44:37 -08:00
dns__rbt_checkproperties(dns_rbt_t *rbt) {
size_t dd;
if (!check_properties_helper(rbt->root)) {
return (false);
}
/* Path from a given node to all its leaves must contain the
* same number of BLACK child nodes. This is done separately
* here instead of inside check_properties_helper() as
* it would take (n log n) complexity otherwise.
*/
return (check_black_distance_helper(rbt->root, &dd));
}
/*
 * Write an indentation prefix for a node at 'depth' to 'f': the depth
 * right-aligned in a four-character field followed by a space, then
 * one "- " marker per level of depth.
 */
static void
dns_rbt_indent(FILE *f, int depth) {
	int i;

	fprintf(f, "%4d ", depth);

	for (i = 0; i < depth; i++) {
		fprintf(f, "- ");
	}
}
void
2020-02-13 14:44:37 -08:00
dns_rbt_printnodeinfo(dns_rbtnode_t *n, FILE *f) {
2019-08-08 13:52:44 +10:00
if (n == NULL) {
fprintf(f, "Null node\n");
return;
}
fprintf(f, "Node info for nodename: ");
printnodename(n, true, f);
fprintf(f, "\n");
2012-06-20 23:46:40 +00:00
fprintf(f, "n = %p\n", n);
fprintf(f, "Relative pointers: %s%s%s%s%s\n",
(n->parent_is_relative == 1 ? " P" : ""),
(n->right_is_relative == 1 ? " R" : ""),
(n->left_is_relative == 1 ? " L" : ""),
(n->down_is_relative == 1 ? " D" : ""),
(n->data_is_relative == 1 ? " T" : ""));
2018-02-15 00:00:17 +11:00
fprintf(f, "node lock address = %u\n", n->locknum);
fprintf(f, "Parent: %p\n", n->parent);
fprintf(f, "Right: %p\n", n->right);
fprintf(f, "Left: %p\n", n->left);
fprintf(f, "Down: %p\n", n->down);
2019-08-08 13:52:44 +10:00
fprintf(f, "Data: %p\n", n->data);
}
static void
2020-02-13 14:44:37 -08:00
printnodename(dns_rbtnode_t *node, bool quoted, FILE *f) {
isc_region_t r;
dns_name_t name;
char buffer[DNS_NAME_FORMATSIZE];
2008-01-22 23:28:04 +00:00
dns_offsets_t offsets;
2008-01-22 23:28:04 +00:00
r.length = NAMELEN(node);
r.base = NAME(node);
2008-01-22 23:28:04 +00:00
dns_name_init(&name, offsets);
dns_name_fromregion(&name, &r);
2008-01-22 23:28:04 +00:00
dns_name_format(&name, buffer, sizeof(buffer));
if (quoted) {
2014-05-30 09:41:33 +10:00
fprintf(f, "\"%s\"", buffer);
} else {
2014-05-30 09:41:33 +10:00
fprintf(f, "%s", buffer);
}
}
/*
 * Recursively print the tree rooted at 'root' to 'f', one node per line.
 *
 * 'parent':       the node the caller expects PARENT(root) to be (NULL
 *                 when 'root' was reached through a down pointer or is
 *                 the top of the tree).
 * 'depth':        number of links followed so far; used for indentation.
 * 'direction':    label describing how 'root' was reached.
 * 'data_printer': optional callback used to print a node's data.
 *
 * Inconsistent parent pointers and red/red violations are reported in
 * the output rather than asserted on.
 */
static void
print_text_helper(dns_rbtnode_t *root, dns_rbtnode_t *parent, int depth,
		  const char *direction, void (*data_printer)(FILE *, void *),
		  FILE *f) {
	dns_rbt_indent(f, depth);

	if (root != NULL) {
		printnodename(root, true, f);
		/*
		 * Don't use IS_RED(root) as it tests for 'root != NULL'
		 * and cppcheck produces false positives.
		 */
		fprintf(f, " (%s, %s", direction,
			COLOR(root) == RED ? "RED" : "BLACK");

		/*
		 * A non-root node must point back at the node we came
		 * from; a subtree root below the top level must be the
		 * down child of its parent.  A NULL parent on such a
		 * subtree root is itself a bad parent pointer and must be
		 * reported, not dereferenced.
		 */
		if ((!IS_ROOT(root) && PARENT(root) != parent) ||
		    (IS_ROOT(root) && depth > 0 &&
		     (PARENT(root) == NULL || DOWN(PARENT(root)) != root)))
		{
			fprintf(f, " (BAD parent pointer! -> ");
			if (PARENT(root) != NULL) {
				printnodename(PARENT(root), true, f);
			} else {
				fprintf(f, "NULL");
			}
			fprintf(f, ")");
		}

		fprintf(f, ")");

		if (root->data != NULL && data_printer != NULL) {
			fprintf(f, " data@%p: ", root->data);
			data_printer(f, root->data);
		}
		fprintf(f, "\n");

		depth++;

		/*
		 * Don't use IS_RED(root) as it tests for 'root != NULL'
		 * and cppcheck produces false positives.
		 */
		if (COLOR(root) == RED && IS_RED(LEFT(root))) {
			fprintf(f, "** Red/Red color violation on left\n");
		}
		print_text_helper(LEFT(root), root, depth, "left", data_printer,
				  f);

		/*
		 * Don't use IS_RED(root) as cppcheck produces false positives.
		 */
		if (COLOR(root) == RED && IS_RED(RIGHT(root))) {
			fprintf(f, "** Red/Red color violation on right\n");
		}
		print_text_helper(RIGHT(root), root, depth, "right",
				  data_printer, f);

		print_text_helper(DOWN(root), NULL, depth, "down", data_printer,
				  f);
	} else {
		fprintf(f, "NULL (%s)\n", direction);
	}
}
void
2020-02-13 14:44:37 -08:00
dns_rbt_printtext(dns_rbt_t *rbt, void (*data_printer)(FILE *, void *),
FILE *f) {
REQUIRE(VALID_RBT(rbt));
print_text_helper(rbt->root, NULL, 0, "root", data_printer, f);
}
/*
 * Recursively emit Graphviz "dot" statements for the tree rooted at
 * 'node' to 'f'.
 *
 * Children (left, right, down) are emitted first; then '*nodecount' is
 * incremented and used as this node's id, the node record is written,
 * and edges to the already-emitted children are added.  When
 * 'show_pointers' is true the node and parent addresses are included in
 * the label.
 *
 * Returns the id assigned to 'node', or 0 if 'node' is NULL.
 */
static int
print_dot_helper(dns_rbtnode_t *node, unsigned int *nodecount,
		 bool show_pointers, FILE *f) {
	unsigned int l, r, d;

	if (node == NULL) {
		return (0);
	}

	l = print_dot_helper(LEFT(node), nodecount, show_pointers, f);
	r = print_dot_helper(RIGHT(node), nodecount, show_pointers, f);
	d = print_dot_helper(DOWN(node), nodecount, show_pointers, f);

	*nodecount += 1;

	fprintf(f, "node%u[label = \"<f0> |<f1> ", *nodecount);
	printnodename(node, false, f);
	fprintf(f, "|<f2>");

	if (show_pointers) {
		/* "%p" requires 'void *' arguments. */
		fprintf(f, "|<f3> n=%p|<f4> p=%p", (void *)node,
			(void *)PARENT(node));
	}

	fprintf(f, "\"] [");

	if (IS_RED(node)) {
		fprintf(f, "color=red");
	} else {
		fprintf(f, "color=black");
	}

	/*
	 * XXXMUKS: verify that IS_ROOT() indicates subtree root and not
	 * forest root.
	 */
	if (IS_ROOT(node)) {
		fprintf(f, ",penwidth=3");
	}

	if (IS_EMPTY(node)) {
		fprintf(f, ",style=filled,fillcolor=lightgrey");
	}

	fprintf(f, "];\n");

	if (LEFT(node) != NULL) {
		fprintf(f, "\"node%u\":f0 -> \"node%u\":f1;\n", *nodecount, l);
	}

	if (DOWN(node) != NULL) {
		fprintf(f, "\"node%u\":f1 -> \"node%u\":f1 [penwidth=5];\n",
			*nodecount, d);
	}

	if (RIGHT(node) != NULL) {
		fprintf(f, "\"node%u\":f2 -> \"node%u\":f1;\n", *nodecount, r);
	}

	return (*nodecount);
}
void
2020-02-13 14:44:37 -08:00
dns_rbt_printdot(dns_rbt_t *rbt, bool show_pointers, FILE *f) {
unsigned int nodecount = 0;
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
fprintf(f, "digraph g {\n");
fprintf(f, "node [shape = record,height=.1];\n");
print_dot_helper(rbt->root, &nodecount, show_pointers, f);
fprintf(f, "}\n");
}
/*
* Chain Functions
*/
void
2020-02-13 14:44:37 -08:00
dns_rbtnodechain_init(dns_rbtnodechain_t *chain) {
REQUIRE(chain != NULL);
/*
* Initialize 'chain'.
*/
chain->end = NULL;
chain->level_count = 0;
chain->level_matches = 0;
memset(chain->levels, 0, sizeof(chain->levels));
1999-04-09 15:21:15 +00:00
chain->magic = CHAIN_MAGIC;
1999-04-09 15:21:15 +00:00
}
/*
 * Report the node the chain currently points at.
 *
 * 'node':   if not NULL, receives chain->end (which may be NULL).
 * 'name':   if not NULL, is set to the node's name; for a node in the
 *           top level tree the trailing root label is stripped so the
 *           name is relative.
 * 'origin': if not NULL, receives the name built by chain_name() when
 *           the chain has at least one level, otherwise a copy of
 *           dns_rootname.
 *
 * Returns ISC_R_NOTFOUND if the chain has no current node, otherwise
 * ISC_R_SUCCESS or the result of chain_name().
 *
 * Requires: 'chain' is a valid chain.
 */
isc_result_t
dns_rbtnodechain_current(dns_rbtnodechain_t *chain, dns_name_t *name,
			 dns_name_t *origin, dns_rbtnode_t **node) {
	isc_result_t result = ISC_R_SUCCESS;

	REQUIRE(VALID_CHAIN(chain));

	if (node != NULL) {
		*node = chain->end;
	}

	if (chain->end == NULL) {
		return (ISC_R_NOTFOUND);
	}

	if (name != NULL) {
		NODENAME(chain->end, name);

		if (chain->level_count == 0) {
			/*
			 * Names in the top level tree are all absolute.
			 * Always make 'name' relative.
			 */
			INSIST(dns_name_isabsolute(name));

			/*
			 * This is cheaper than dns_name_getlabelsequence().
			 */
			name->labels--;
			name->length--;
			name->attributes &= ~DNS_NAMEATTR_ABSOLUTE;
		}
	}

	if (origin != NULL) {
		if (chain->level_count > 0) {
			result = chain_name(chain, origin, false);
		} else {
			dns_name_copynf(dns_rootname, origin);
		}
	}

	return (result);
}
isc_result_t
1999-04-09 15:21:15 +00:00
dns_rbtnodechain_prev(dns_rbtnodechain_t *chain, dns_name_t *name,
2020-02-13 14:44:37 -08:00
dns_name_t *origin) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *current, *previous, *predecessor;
2020-02-13 14:44:37 -08:00
isc_result_t result = ISC_R_SUCCESS;
bool new_origin = false;
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);
predecessor = NULL;
current = chain->end;
if (LEFT(current) != NULL) {
/*
* Moving left one then right as far as possible is the
* previous node, at least for this level.
*/
current = LEFT(current);
while (RIGHT(current) != NULL) {
2008-01-22 23:28:04 +00:00
current = RIGHT(current);
}
2008-01-22 23:28:04 +00:00
predecessor = current;
} else {
/*
* No left links, so move toward the root. If at any point on
* the way there the link from parent to child is a right
* link, then the parent is the previous node, at least
* for this level.
*/
while (!IS_ROOT(current)) {
2008-01-22 23:28:04 +00:00
previous = current;
current = PARENT(current);
if (RIGHT(current) == previous) {
predecessor = current;
break;
}
}
}
if (predecessor != NULL) {
/*
* Found a predecessor node in this level. It might not
* really be the predecessor, however.
*/
if (DOWN(predecessor) != NULL) {
/*
* The predecessor is really down at least one level.
* Go down and as far right as possible, and repeat
* as long as the rightmost node has a down pointer.
*/
do {
/*
* XXX DCL Need to do something about origins
* here. See whether to go down, and if so
* whether it is truly what Bob calls a
* new origin.
*/
ADD_LEVEL(chain, predecessor);
predecessor = DOWN(predecessor);
/* XXX DCL duplicated from above; clever
* way to unduplicate? */
while (RIGHT(predecessor) != NULL) {
2008-01-22 23:28:04 +00:00
predecessor = RIGHT(predecessor);
}
2008-01-22 23:28:04 +00:00
} while (DOWN(predecessor) != NULL);
/* XXX DCL probably needs work on the concept */
if (origin != NULL) {
new_origin = true;
}
2008-01-22 23:28:04 +00:00
}
} else if (chain->level_count > 0) {
/*
* Dang, didn't find a predecessor in this level.
* Got to the root of this level without having traversed
* any right links. Ascend the tree one level; the
* node that points to this tree is the predecessor.
*/
INSIST(chain->level_count > 0 && IS_ROOT(current));
predecessor = chain->levels[--chain->level_count];
/* XXX DCL probably needs work on the concept */
/*
* Don't declare an origin change when the new origin is "."
* at the top level tree, because "." is declared as the origin
* for the second level tree.
*/
if (origin != NULL &&
(chain->level_count > 0 || OFFSETLEN(predecessor) > 1)) {
new_origin = true;
}
2008-01-22 23:28:04 +00:00
}
if (predecessor != NULL) {
chain->end = predecessor;
if (new_origin) {
result = dns_rbtnodechain_current(chain, name, origin,
NULL);
if (result == ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
result = DNS_R_NEWORIGIN;
}
} else {
2008-01-22 23:28:04 +00:00
result = dns_rbtnodechain_current(chain, name, NULL,
NULL);
}
} else {
2008-01-22 23:28:04 +00:00
result = ISC_R_NOMORE;
}
2008-01-22 23:28:04 +00:00
return (result);
1999-04-09 15:21:15 +00:00
}
/*
 * Move 'chain' from its current node to the leftmost node of the
 * subtree hanging off that node's down pointer.
 *
 * 'name', if not NULL, is set to the new node's name.  'origin', if not
 * NULL, is filled in by chain_name() when the move changes the origin.
 *
 * Returns ISC_R_NOMORE if the current node has no down pointer (the
 * chain is left unchanged), DNS_R_NEWORIGIN if the origin changed (and
 * chain_name(), when called, succeeded), the chain_name() error if it
 * failed, and ISC_R_SUCCESS otherwise.
 *
 * Requires: 'chain' is a valid chain with a current node.
 */
isc_result_t
dns_rbtnodechain_down(dns_rbtnodechain_t *chain, dns_name_t *name,
		      dns_name_t *origin) {
	dns_rbtnode_t *current, *successor;
	isc_result_t result = ISC_R_SUCCESS;
	bool new_origin = false;

	REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);

	successor = NULL;

	current = chain->end;

	if (DOWN(current) != NULL) {
		/*
		 * Don't declare an origin change when the new origin is "."
		 * at the second level tree, because "." is already declared
		 * as the origin for the top level tree.
		 */
		if (chain->level_count > 0 || OFFSETLEN(current) > 1) {
			new_origin = true;
		}

		ADD_LEVEL(chain, current);
		current = DOWN(current);

		while (LEFT(current) != NULL) {
			current = LEFT(current);
		}

		successor = current;
	}

	if (successor != NULL) {
		chain->end = successor;

		/*
		 * It is not necessary to use dns_rbtnodechain_current like
		 * the other functions because this function will never
		 * find a node in the topmost level.  This is because the
		 * root level will never be more than one name, and everything
		 * in the megatree is a successor to that node, down at
		 * the second level or below.
		 */
		if (name != NULL) {
			NODENAME(chain->end, name);
		}

		if (new_origin) {
			if (origin != NULL) {
				result = chain_name(chain, origin, false);
			}

			if (result == ISC_R_SUCCESS) {
				result = DNS_R_NEWORIGIN;
			}
		} else {
			result = ISC_R_SUCCESS;
		}
	} else {
		result = ISC_R_NOMORE;
	}

	return (result);
}
isc_result_t
2020-02-13 14:44:37 -08:00
dns_rbtnodechain_nextflat(dns_rbtnodechain_t *chain, dns_name_t *name) {
dns_rbtnode_t *current, *previous, *successor;
2020-02-13 14:44:37 -08:00
isc_result_t result = ISC_R_SUCCESS;
REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);
successor = NULL;
current = chain->end;
if (RIGHT(current) == NULL) {
while (!IS_ROOT(current)) {
previous = current;
current = PARENT(current);
if (LEFT(current) == previous) {
successor = current;
break;
}
}
} else {
current = RIGHT(current);
while (LEFT(current) != NULL) {
current = LEFT(current);
}
successor = current;
}
if (successor != NULL) {
chain->end = successor;
if (name != NULL) {
NODENAME(chain->end, name);
}
result = ISC_R_SUCCESS;
} else {
result = ISC_R_NOMORE;
}
return (result);
}
isc_result_t
1999-04-09 15:21:15 +00:00
dns_rbtnodechain_next(dns_rbtnodechain_t *chain, dns_name_t *name,
2020-02-13 14:44:37 -08:00
dns_name_t *origin) {
2008-01-22 23:28:04 +00:00
dns_rbtnode_t *current, *previous, *successor;
2020-02-13 14:44:37 -08:00
isc_result_t result = ISC_R_SUCCESS;
bool new_origin = false;
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);
successor = NULL;
current = chain->end;
/*
* If there is a level below this node, the next node is the leftmost
* node of the next level.
*/
if (DOWN(current) != NULL) {
/*
* Don't declare an origin change when the new origin is "."
* at the second level tree, because "." is already declared
* as the origin for the top level tree.
*/
if (chain->level_count > 0 || OFFSETLEN(current) > 1) {
new_origin = true;
}
2008-01-22 23:28:04 +00:00
ADD_LEVEL(chain, current);
current = DOWN(current);
while (LEFT(current) != NULL) {
2008-01-22 23:28:04 +00:00
current = LEFT(current);
}
2008-01-22 23:28:04 +00:00
successor = current;
} else if (RIGHT(current) == NULL) {
/*
* The successor is up, either in this level or a previous one.
* Head back toward the root of the tree, looking for any path
* that was via a left link; the successor is the node that has
* that left link. In the event the root of the level is
* reached without having traversed any left links, ascend one
* level and look for either a right link off the point of
* ascent, or search for a left link upward again, repeating
2009-01-17 14:45:17 +00:00
* ascends until either case is true.
2008-01-22 23:28:04 +00:00
*/
do {
while (!IS_ROOT(current)) {
2008-01-22 23:28:04 +00:00
previous = current;
current = PARENT(current);
if (LEFT(current) == previous) {
successor = current;
break;
}
}
if (successor == NULL) {
/*
* Reached the root without having traversed
* any left pointers, so this level is done.
*/
if (chain->level_count == 0) {
/*
* If the tree we are iterating over
* was modified since this chain was
* initialized in a way that caused
* node splits to occur, "current" may
* now be pointing to a root node which
* appears to be at level 0, but still
* has a parent. If that happens,
* abort. Otherwise, we are done
* looking for a successor as we really
* reached the root node on level 0.
*/
INSIST(PARENT(current) == NULL);
2008-01-22 23:28:04 +00:00
break;
}
2008-01-22 23:28:04 +00:00
current = chain->levels[--chain->level_count];
new_origin = true;
2008-01-22 23:28:04 +00:00
if (RIGHT(current) != NULL) {
2008-01-22 23:28:04 +00:00
break;
}
2008-01-22 23:28:04 +00:00
}
} while (successor == NULL);
}
if (successor == NULL && RIGHT(current) != NULL) {
current = RIGHT(current);
while (LEFT(current) != NULL) {
2008-01-22 23:28:04 +00:00
current = LEFT(current);
}
2008-01-22 23:28:04 +00:00
successor = current;
}
if (successor != NULL) {
/*
* If we determine that the current node is the successor to
* itself, we will run into an infinite loop, so abort instead.
*/
INSIST(chain->end != successor);
2008-01-22 23:28:04 +00:00
chain->end = successor;
/*
* It is not necessary to use dns_rbtnodechain_current like
* the other functions because this function will never
* find a node in the topmost level. This is because the
* root level will never be more than one name, and everything
* in the megatree is a successor to that node, down at
* the second level or below.
*/
if (name != NULL) {
2008-01-22 23:28:04 +00:00
NODENAME(chain->end, name);
}
2008-01-22 23:28:04 +00:00
if (new_origin) {
if (origin != NULL) {
result = chain_name(chain, origin, false);
}
2008-01-22 23:28:04 +00:00
if (result == ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
result = DNS_R_NEWORIGIN;
}
} else {
2008-01-22 23:28:04 +00:00
result = ISC_R_SUCCESS;
}
} else {
2008-01-22 23:28:04 +00:00
result = ISC_R_NOMORE;
}
2008-01-22 23:28:04 +00:00
return (result);
1999-04-09 15:21:15 +00:00
}
isc_result_t
1999-04-09 15:21:15 +00:00
dns_rbtnodechain_first(dns_rbtnodechain_t *chain, dns_rbt_t *rbt,
2008-01-22 23:28:04 +00:00
dns_name_t *name, dns_name_t *origin)
1999-04-09 15:21:15 +00:00
{
2008-01-22 23:28:04 +00:00
isc_result_t result;
1999-04-09 15:21:15 +00:00
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
REQUIRE(VALID_CHAIN(chain));
2008-01-22 23:28:04 +00:00
dns_rbtnodechain_reset(chain);
2008-01-22 23:28:04 +00:00
chain->end = rbt->root;
2008-01-22 23:28:04 +00:00
result = dns_rbtnodechain_current(chain, name, origin, NULL);
if (result == ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
result = DNS_R_NEWORIGIN;
}
2008-01-22 23:28:04 +00:00
return (result);
}
isc_result_t
1999-04-09 15:21:15 +00:00
dns_rbtnodechain_last(dns_rbtnodechain_t *chain, dns_rbt_t *rbt,
dns_name_t *name, dns_name_t *origin)
1999-04-09 15:21:15 +00:00
{
2008-01-22 23:28:04 +00:00
isc_result_t result;
1999-04-09 15:21:15 +00:00
2008-01-22 23:28:04 +00:00
REQUIRE(VALID_RBT(rbt));
REQUIRE(VALID_CHAIN(chain));
2008-01-22 23:28:04 +00:00
dns_rbtnodechain_reset(chain);
2008-01-22 23:28:04 +00:00
result = move_chain_to_last(chain, rbt->root);
if (result != ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
return (result);
}
2008-01-22 23:28:04 +00:00
result = dns_rbtnodechain_current(chain, name, origin, NULL);
if (result == ISC_R_SUCCESS) {
2008-01-22 23:28:04 +00:00
result = DNS_R_NEWORIGIN;
}
2008-01-22 23:28:04 +00:00
return (result);
1999-04-09 15:21:15 +00:00
}
void
2020-02-13 14:44:37 -08:00
dns_rbtnodechain_reset(dns_rbtnodechain_t *chain) {
REQUIRE(VALID_CHAIN(chain));
2008-01-22 23:28:04 +00:00
/*
* Free any dynamic storage associated with 'chain', and then
* reinitialize 'chain'.
*/
chain->end = NULL;
chain->level_count = 0;
chain->level_matches = 0;
}
1999-04-09 15:21:15 +00:00
void
2020-02-13 14:44:37 -08:00
dns_rbtnodechain_invalidate(dns_rbtnodechain_t *chain) {
2008-01-22 23:28:04 +00:00
/*
* Free any dynamic storage associated with 'chain', and then
* invalidate 'chain'.
*/
1999-04-09 15:21:15 +00:00
2008-01-22 23:28:04 +00:00
dns_rbtnodechain_reset(chain);
1999-04-09 15:21:15 +00:00
2008-01-22 23:28:04 +00:00
chain->magic = 0;
1999-04-09 15:21:15 +00:00
}
/* XXXMUKS:
*
* - worth removing inline as static functions are inlined automatically
* where suitable by modern compilers.
* - bump the size of dns_rbt.nodecount to size_t.
* - the dumpfile header also contains a nodecount that is unsigned
* int. If large files (> 2^32 nodes) are to be supported, the
* allocation for this field should be increased.
*/