/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* SPDX-License-Identifier: MPL-2.0
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
/*! \file */
#include <inttypes.h>
#include <stdbool.h>
#include <sys/stat.h>
#include <unistd.h>
#include <isc/file.h>
#include <isc/hash.h>
#include <isc/hex.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/once.h>
#include <isc/refcount.h>
#include <isc/result.h>
#include <isc/stdio.h>
#include <isc/string.h>
#include <isc/util.h>
#include <dns/db.h>
#include <dns/fixedname.h>
#include <dns/rbt.h>
#define CHECK(x) \
do { \
result = (x); \
if (result != ISC_R_SUCCESS) \
goto cleanup; \
} while (0)
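/*
 * Illustrative usage of CHECK() (a sketch; some_call() and
 * another_call() are hypothetical):
 *
 *	isc_result_t result;
 *
 *	CHECK(some_call());		// jumps to cleanup on failure
 *	CHECK(another_call());
 *	return (ISC_R_SUCCESS);
 * cleanup:
 *	// undo any partial work
 *	return (result);
 */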
#define RBT_MAGIC ISC_MAGIC('R', 'B', 'T', '+')
#define VALID_RBT(rbt) ISC_MAGIC_VALID(rbt, RBT_MAGIC)
/*
* XXXDCL Since parent pointers were added in again, I could remove all of the
* chain junk, and replace with dns_rbt_firstnode, _previousnode, _nextnode,
* _lastnode. This would involve a pretty major change to the API.
*/
#define CHAIN_MAGIC ISC_MAGIC('0', '-', '0', '-')
#define VALID_CHAIN(chain) ISC_MAGIC_VALID(chain, CHAIN_MAGIC)
/*
 * The hash table is resized incrementally: during a resize the old
 * and the new table coexist. Lookup and delete operations probe both
 * tables, insertions go only into the new table, and every insertion
 * also migrates entries from the old table. Once the old table is
 * empty, it is deallocated. This avoids the resolution brownouts
 * that rehashing a large cache in one pass used to cause.
 * See https://en.wikipedia.org/wiki/Hash_table#Dynamic_resizing
 */
#define RBT_HASH_NEXTTABLE(hindex) ((hindex == 0) ? 1 : 0)
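/*
 * A minimal sketch of the incremental resizing scheme (illustrative
 * only; entry_t, table[], bits[] and hiter are stand-ins for the
 * dns_rbt_t members below, and the bucket index is taken from the
 * top bits of the hash value):
 *
 *	typedef struct entry {
 *		uint32_t hash;
 *		struct entry *next;
 *	} entry_t;
 *
 *	// Move one entry from the old table into the new one; called
 *	// from every insertion while a resize is in progress.
 *	static void
 *	rehash_one(entry_t **table[2], uint8_t bits[2], uint8_t hindex,
 *		   uint32_t *hiter) {
 *		uint8_t old = RBT_HASH_NEXTTABLE(hindex);
 *		while (*hiter < (1U << bits[old])) {
 *			entry_t *e = table[old][*hiter];
 *			if (e != NULL) {
 *				uint32_t b = e->hash >> (32 - bits[hindex]);
 *				table[old][*hiter] = e->next;
 *				e->next = table[hindex][b];
 *				table[hindex][b] = e;
 *				return;
 *			}
 *			(*hiter)++;	// old bucket drained, advance
 *		}
 *		// old table is empty: the resize is complete and the
 *		// old table can be freed
 *	}
 */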
struct dns_rbt {
unsigned int magic;
isc_mem_t *mctx;
dns_rbtnode_t *root;
void (*data_deleter)(void *, void *);
void *deleter_arg;
unsigned int nodecount;
uint8_t hashbits[2];
dns_rbtnode_t **hashtable[2];
uint8_t hindex;
uint32_t hiter;
};
#define IS_EMPTY(node) ((node)->data == NULL)
/*
 * Only test node->data if we care about whether data is present or not.
 */
#define WANTEMPTYDATA_OR_DATA(options, node) \
((options & DNS_RBTFIND_EMPTYDATA) != 0 || node->data != NULL)
/*%
* The variable length stuff stored after the node has the following
* structure.
*
* NAME_DATA{1..255} OLDOFFSETLEN{1} OFFSETS{1..128}
*
* NAME_DATA contains the name of the node when it was created.
* OLDOFFSETLEN contains the length of OFFSETS when the node was created.
* OFFSETS contains the offsets into name for each label when the node
* was created.
*/
#define NAME(node) ((unsigned char *)((node) + 1))
#define OFFSETS(node) (NAME(node) + node->oldnamelen + 1)
#define OLDOFFSETLEN(node) (OFFSETS(node)[-1])
#define NODE_SIZE(node) \
(sizeof(*node) + node->oldnamelen + OLDOFFSETLEN(node) + 1)
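/*
 * Worked example of this layout (illustrative): a node created for
 * "example.com." stores NAME_DATA = \007example\003com\000 (13 bytes,
 * so oldnamelen == 13), a single OLDOFFSETLEN byte holding 3, and
 * OFFSETS = { 0, 8, 12 }, the offset of each label within the name.
 * NODE_SIZE() then evaluates to sizeof(dns_rbtnode_t) + 13 + 3 + 1.
 */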
/*%
* Color management.
*/
#define RED 0
#define BLACK 1
#define IS_RED(node) ((node) != NULL && (node)->color == RED)
#define IS_BLACK(node) ((node) == NULL || (node)->color == BLACK)
/*%
 * Chain management.
 *
 * The "ancestors" member of chains was removed, with its job now
 * being wholly handled by parent pointers (which didn't exist, because
 * of memory concerns, when chains were first implemented).
 */
#define ADD_LEVEL(chain, node) \
do { \
INSIST((chain)->level_count < DNS_RBT_LEVELBLOCK); \
(chain)->levels[(chain)->level_count++] = (node); \
} while (0)
/*
* Initialize a dns_name_t that refers to a node's name.
*/
static void
node_name(dns_rbtnode_t *node, dns_name_t *name) {
name->length = node->namelen;
name->labels = node->offsetlen;
name->ndata = NAME(node);
name->offsets = OFFSETS(node);
name->attributes = (struct dns_name_attrs){ .absolute = node->absolute,
.readonly = true };
}
#ifdef DEBUG
/*
* A little something to help out in GDB.
*/
dns_name_t
Name(dns_rbtnode_t *node);
dns_name_t
Name(dns_rbtnode_t *node) {
dns_name_t name;
dns_name_init(&name, NULL);
if (node != NULL) {
node_name(node, &name);
}
return (name);
}
#endif /* DEBUG */
/*
* Upper node is the parent of the root of the passed node's
* subtree. The passed node must not be NULL.
*/
static dns_rbtnode_t *
get_upper_node(dns_rbtnode_t *node) {
return (node->uppernode);
}
size_t
dns__rbtnode_getdistance(dns_rbtnode_t *node) {
size_t nodes = 1;
while (node != NULL) {
if (node->is_root) {
break;
}
nodes++;
node = node->parent;
}
return (nodes);
}
/*
* Forward declarations.
*/
static dns_rbtnode_t *
rbtnode_new(isc_mem_t *mctx, const dns_name_t *name);
static void
hashtable_new(dns_rbt_t *rbt, uint8_t index, uint8_t bits);
static void
hashtable_free(dns_rbt_t *rbt, uint8_t index);
static void
hash_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name);
static void
unhash_node(dns_rbt_t *rbt, dns_rbtnode_t *node);
static uint32_t
rehash_bits(dns_rbt_t *rbt, size_t newcount);
static void
hashtable_rehash(dns_rbt_t *rbt, uint32_t newbits);
static void
hashtable_rehash_one(dns_rbt_t *rbt);
static void
maybe_rehash(dns_rbt_t *rbt, size_t size);
static bool
rehashing_in_progress(dns_rbt_t *rbt);
#define TRY_NEXTTABLE(hindex, rbt) \
(hindex == rbt->hindex && rehashing_in_progress(rbt))
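/*
 * Sketch of how a lookup probes the two tables (illustrative; the
 * real loops appear later in this file):
 *
 *	uint8_t hindex = rbt->hindex;
 * nexttable:
 *	// ... probe rbt->hashtable[hindex] for the entry ...
 *	if (!found && TRY_NEXTTABLE(hindex, rbt)) {
 *		// A rehash is in progress and the entry may still
 *		// be in the old table.
 *		hindex = RBT_HASH_NEXTTABLE(hindex);
 *		goto nexttable;
 *	}
 */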
static void
rotate_left(dns_rbtnode_t *node, dns_rbtnode_t **rootp);
static void
rotate_right(dns_rbtnode_t *node, dns_rbtnode_t **rootp);
static void
addonlevel(dns_rbtnode_t *node, dns_rbtnode_t *current, int order,
dns_rbtnode_t **rootp);
static void
deletefromlevel(dns_rbtnode_t *item, dns_rbtnode_t **rootp);
static void
deletetreeflat(dns_rbt_t *rbt, unsigned int quantum, bool unhash,
dns_rbtnode_t **nodep);
static void
printnodename(dns_rbtnode_t *node, bool quoted, FILE *f);
static void
freenode(dns_rbt_t *rbt, dns_rbtnode_t **nodep);
unsigned int
dns__rbtnode_namelen(dns_rbtnode_t *node) {
dns_name_t current;
unsigned int len = 0;
REQUIRE(DNS_RBTNODE_VALID(node));
dns_name_init(&current, NULL);
do {
if (node != NULL) {
node_name(node, &current);
len += current.length;
} else {
len += 1;
break;
}
node = get_upper_node(node);
} while (!dns_name_isabsolute(&current));
return (len);
}
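/*
 * Example (illustrative): for a node storing "www" whose upper node
 * stores the absolute name "example.com.", the loop above sums 4
 * bytes (\003www) and 13 bytes (\007example\003com\000) and stops at
 * the absolute name, returning 17, the wire length of
 * "www.example.com.".
 */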
unsigned int
dns__rbtnode_getsize(dns_rbtnode_t *node) {
REQUIRE(DNS_RBTNODE_VALID(node));
return (NODE_SIZE(node));
}
/*
* Initialize a red/black tree of trees.
*/
isc_result_t
dns_rbt_create(isc_mem_t *mctx, dns_rbtdeleter_t deleter, void *deleter_arg,
dns_rbt_t **rbtp) {
dns_rbt_t *rbt;
REQUIRE(mctx != NULL);
REQUIRE(rbtp != NULL && *rbtp == NULL);
REQUIRE(deleter == NULL ? deleter_arg == NULL : 1);
rbt = isc_mem_get(mctx, sizeof(*rbt));
*rbt = (dns_rbt_t){
.data_deleter = deleter,
.deleter_arg = deleter_arg,
};
isc_mem_attach(mctx, &rbt->mctx);
hashtable_new(rbt, 0, ISC_HASH_MIN_BITS);
rbt->magic = RBT_MAGIC;
*rbtp = rbt;
return (ISC_R_SUCCESS);
}
/*
* Deallocate a red/black tree of trees.
*/
isc_result_t
dns_rbt_destroy(dns_rbt_t **rbtp, unsigned int quantum) {
dns_rbt_t *rbt;
REQUIRE(rbtp != NULL && VALID_RBT(*rbtp));
rbt = *rbtp;
deletetreeflat(rbt, quantum, false, &rbt->root);
if (rbt->root != NULL) {
return (ISC_R_QUOTA);
}
*rbtp = NULL;
INSIST(rbt->nodecount == 0);
if (rbt->hashtable[0] != NULL) {
hashtable_free(rbt, 0);
}
if (rbt->hashtable[1] != NULL) {
hashtable_free(rbt, 1);
}
rbt->magic = 0;
isc_mem_putanddetach(&rbt->mctx, rbt, sizeof(*rbt));
return (ISC_R_SUCCESS);
}
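/*
 * Typical create/destroy usage (a sketch; obtaining the memory
 * context and building 'name' are assumed):
 *
 *	dns_rbt_t *rbt = NULL;
 *	dns_rbtnode_t *node = NULL;
 *
 *	RUNTIME_CHECK(dns_rbt_create(mctx, NULL, NULL, &rbt) ==
 *		      ISC_R_SUCCESS);
 *	result = dns_rbt_addnode(rbt, name, &node);	// name is absolute
 *	...
 *	RUNTIME_CHECK(dns_rbt_destroy(&rbt, 0) == ISC_R_SUCCESS);
 *
 * With quantum == 0 the destroy call frees the entire tree at once
 * instead of limiting itself to 'quantum' nodes per call.
 */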
unsigned int
dns_rbt_nodecount(dns_rbt_t *rbt) {
REQUIRE(VALID_RBT(rbt));
return (rbt->nodecount);
}
size_t
dns_rbt_hashsize(dns_rbt_t *rbt) {
REQUIRE(VALID_RBT(rbt));
uint8_t hashbits = (rbt->hashbits[0] > rbt->hashbits[1])
? rbt->hashbits[0]
: rbt->hashbits[1];
return ((size_t)1 << hashbits);
}
static isc_result_t
chain_name(dns_rbtnodechain_t *chain, dns_name_t *name,
bool include_chain_end) {
dns_name_t nodename;
isc_result_t result = ISC_R_SUCCESS;
int i;
dns_name_init(&nodename, NULL);
if (include_chain_end && chain->end != NULL) {
node_name(chain->end, &nodename);
dns_name_copy(&nodename, name);
} else {
dns_name_reset(name);
}
for (i = (int)chain->level_count - 1; i >= 0; i--) {
node_name(chain->levels[i], &nodename);
result = dns_name_concatenate(name, &nodename, name, NULL);
if (result != ISC_R_SUCCESS) {
return (result);
}
}
return (result);
}
static isc_result_t
move_chain_to_last(dns_rbtnodechain_t *chain, dns_rbtnode_t *node) {
do {
/*
* Go as far right and then down as much as possible,
* as long as the rightmost node has a down pointer.
*/
while (node->right != NULL) {
node = node->right;
}
if (node->down == NULL) {
break;
}
ADD_LEVEL(chain, node);
node = node->down;
} while (1);
chain->end = node;
return (ISC_R_SUCCESS);
}
/*
* Add 'name' to tree, initializing its data pointer with 'data'.
*/
isc_result_t
dns_rbt_addnode(dns_rbt_t *rbt, const dns_name_t *name, dns_rbtnode_t **nodep) {
/*
* Does this thing have too many variables or what?
*/
dns_rbtnode_t **root, *parent, *child, *current, *new_current;
dns_name_t *add_name, *new_name, current_name, *prefix, *suffix;
dns_fixedname_t fixedcopy, fixedprefix, fixedsuffix, fnewname;
dns_offsets_t current_offsets;
dns_namereln_t compared;
isc_result_t result = ISC_R_SUCCESS;
unsigned int level_count;
unsigned int common_labels;
unsigned int nlabels, hlabels;
int order;
REQUIRE(VALID_RBT(rbt));
REQUIRE(dns_name_isabsolute(name));
REQUIRE(nodep != NULL && *nodep == NULL);
/*
* Dear future BIND developer,
*
* After you have tried attempting to optimize this routine by
* using the hashtable and have realized your folly, please
* append another cross ("X") below as a warning to the next
* future BIND developer:
*
* Number of victim developers: X
*
* I wish the past developer had included such a notice.
*
* Long form: Unlike dns_rbt_findnode(), this function does not
* lend itself to be optimized using the hashtable:
*
* 1. In the subtree where the insertion occurs, this function
* needs to have the insertion point and the order where the
* lookup terminated (i.e., at the insertion point where left or
* right child is NULL). This cannot be determined from the
* hashtable, so at least in that subtree, a BST O(log N) lookup
* is necessary.
*
* 2. Our RBT nodes contain not only single labels but label
* sequences to optimize space usage. So at every level, we have
* to look for a match in the hashtable for all superdomains in
* the rest of the name we're searching. This is an O(N)
* operation at least, here N being the label size of name, each
* of which is a hashtable lookup involving dns_name_equal()
* comparisons.
*/
/*
* Create a copy of the name so the original name structure is
* not modified.
*/
add_name = dns_fixedname_initname(&fixedcopy);
INSIST(add_name != NULL);
dns_name_clone(name, add_name);
if (rbt->root == NULL) {
new_current = rbtnode_new(rbt->mctx, add_name);
rbt->nodecount++;
new_current->is_root = 1;
new_current->uppernode = NULL;
rbt->root = new_current;
*nodep = new_current;
hash_node(rbt, new_current, name);
return (ISC_R_SUCCESS);
}
level_count = 0;
prefix = dns_fixedname_initname(&fixedprefix);
suffix = dns_fixedname_initname(&fixedsuffix);
INSIST(prefix != NULL);
INSIST(suffix != NULL);
root = &rbt->root;
INSIST((*root)->is_root);
parent = NULL;
current = NULL;
child = *root;
dns_name_init(&current_name, current_offsets);
new_name = dns_fixedname_initname(&fnewname);
nlabels = dns_name_countlabels(name);
hlabels = 0;
do {
current = child;
node_name(current, &current_name);
compared = dns_name_fullcompare(add_name, &current_name, &order,
&common_labels);
if (compared == dns_namereln_equal) {
*nodep = current;
result = ISC_R_EXISTS;
break;
}
if (compared == dns_namereln_none) {
if (order < 0) {
parent = current;
child = current->left;
} else if (order > 0) {
parent = current;
child = current->right;
}
} else {
/*
* This name has some suffix in common with the
* name at the current node. If the name at
* the current node is shorter, that means the
* new name should be in a subtree. If the
* name at the current node is longer, that means
* the down pointer to this tree should point
* to a new tree that has the common suffix, and
* the non-common parts of these two names should
* start a new tree.
*/
hlabels += common_labels;
if (compared == dns_namereln_subdomain) {
/*
* All of the existing labels are in common,
* so the new name is in a subtree.
* Whack off the common labels for the
* not-in-common part to be searched for
* in the next level.
*/
dns_name_split(add_name, common_labels,
add_name, NULL);
/*
* Follow the down pointer (possibly NULL).
*/
root = &current->down;
INSIST(*root == NULL ||
((*root)->is_root &&
(*root)->parent == current));
parent = NULL;
child = current->down;
INSIST(level_count < DNS_RBT_LEVELBLOCK);
level_count++;
} else {
/*
* The number of labels in common is fewer
* than the number of labels at the current
* node, so the current node must be adjusted
* to have just the common suffix, and a down
* pointer made to a new tree.
*/
INSIST(compared ==
dns_namereln_commonancestor ||
compared == dns_namereln_contains);
/*
* Ensure the number of levels in the tree
* does not exceed the number of logical
* levels allowed by DNSSEC.
*
* XXXDCL need a better error result?
*/
if (level_count >= DNS_RBT_LEVELBLOCK) {
result = ISC_R_NOSPACE;
break;
}
/*
* Split the name into two parts, a prefix
* which is the not-in-common parts of the
* two names and a suffix that is the common
* parts of them.
*/
dns_name_split(&current_name, common_labels,
prefix, suffix);
new_current = rbtnode_new(rbt->mctx, suffix);
/*
* Reproduce the tree attributes of the
* current node.
*/
new_current->is_root = current->is_root;
if (current->nsec == DNS_DB_NSEC_HAS_NSEC) {
new_current->nsec = DNS_DB_NSEC_NORMAL;
} else {
new_current->nsec = current->nsec;
}
new_current->parent = current->parent;
new_current->left = current->left;
new_current->right = current->right;
new_current->color = current->color;
/*
* Fix pointers that were to the current node.
*/
if (parent != NULL) {
if (parent->left == current) {
parent->left = new_current;
} else {
parent->right = new_current;
}
}
if (new_current->left != NULL) {
new_current->left->parent = new_current;
}
if (new_current->right != NULL) {
new_current->right->parent =
new_current;
}
if (*root == current) {
*root = new_current;
}
current->namelen = prefix->length;
current->offsetlen = prefix->labels;
/*
* Set up the new root of the next level.
* By definition it will not be the top
* level tree, so clear the absolute flag.
*/
current->is_root = 1;
current->parent = new_current;
new_current->down = current;
root = &new_current->down;
new_current->uppernode = current->uppernode;
current->uppernode = new_current;
INSIST(level_count < DNS_RBT_LEVELBLOCK);
level_count++;
current->left = NULL;
current->right = NULL;
current->color = BLACK;
current->absolute = false;
rbt->nodecount++;
dns_name_getlabelsequence(name,
nlabels - hlabels,
hlabels, new_name);
hash_node(rbt, new_current, new_name);
if (common_labels ==
dns_name_countlabels(add_name))
{
/*
* The name has been added by pushing
* the not-in-common parts down to
* a new level.
*/
*nodep = new_current;
return (ISC_R_SUCCESS);
} else {
/*
* The current node has no data,
* because it is just a placeholder.
* Its data pointer is already NULL
* from rbtnode_new(), so there's
* nothing more to do to it.
*
* The not-in-common parts of the new
* name will be inserted into the new
* level following this loop.
*/
dns_name_split(add_name, common_labels,
add_name, NULL);
result = ISC_R_SUCCESS;
break;
}
}
}
} while (child != NULL);
if (result == ISC_R_SUCCESS) {
new_current = rbtnode_new(rbt->mctx, add_name);
}
if (result == ISC_R_SUCCESS) {
if (*root == NULL) {
new_current->uppernode = current;
} else {
new_current->uppernode = (*root)->parent;
}
addonlevel(new_current, current, order, root);
rbt->nodecount++;
*nodep = new_current;
hash_node(rbt, new_current, name);
}
return (result);
}
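/*
 * Illustrative example of the tree-of-trees shape dns_rbt_addnode()
 * produces: adding "a.example.com." and then "b.example.com." to an
 * empty tree first creates one node holding the whole name, then
 * splits it at the common suffix, leaving "example.com." with a down
 * pointer to a lower level holding "a" and "b":
 *
 *	example.com.		(top-level tree)
 *	     |
 *	    down
 *	     |
 *	     a --- b		(one red-black tree per level)
 */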
/*
* Find the node for "name" in the tree of trees.
*/
isc_result_t
dns__rbt_findnode(dns_rbt_t *rbt, const dns_name_t *name, dns_name_t *foundname,
dns_rbtnode_t **node, dns_rbtnodechain_t *chain,
unsigned int options, dns_rbtfindcallback_t callback,
void *callback_arg DNS__DB_FLARG) {
dns_rbtnode_t *current, *last_compared;
dns_rbtnodechain_t localchain;
dns_name_t *search_name, current_name, *callback_name;
dns_fixedname_t fixedcallbackname, fixedsearchname;
dns_namereln_t compared;
isc_result_t result, saved_result;
unsigned int common_labels;
unsigned int hlabels = 0;
int order;
uint8_t hindex;
REQUIRE(VALID_RBT(rbt));
REQUIRE(dns_name_isabsolute(name));
REQUIRE(node != NULL && *node == NULL);
REQUIRE((options & (DNS_RBTFIND_NOEXACT | DNS_RBTFIND_NOPREDECESSOR)) !=
(DNS_RBTFIND_NOEXACT | DNS_RBTFIND_NOPREDECESSOR));
/*
* If there is a chain it needs to appear to be in a sane state,
* otherwise a chain is still needed to generate foundname and
* callback_name.
*/
if (chain == NULL) {
options |= DNS_RBTFIND_NOPREDECESSOR;
chain = &localchain;
dns_rbtnodechain_init(chain);
} else {
dns_rbtnodechain_reset(chain);
}
if (rbt->root == NULL) {
return (ISC_R_NOTFOUND);
}
/*
* Appease GCC about variables it incorrectly thinks are
* possibly used uninitialized.
*/
compared = dns_namereln_none;
last_compared = NULL;
order = 0;
callback_name = dns_fixedname_initname(&fixedcallbackname);
/*
* search_name is the name segment being sought in each tree level.
* By using a fixedname, the search_name will definitely have offsets
* for use by any splitting. By using dns_name_clone, no name data
* should be copied.
*/
search_name = dns_fixedname_initname(&fixedsearchname);
INSIST(search_name != NULL);
dns_name_clone(name, search_name);
dns_name_init(&current_name, NULL);
saved_result = ISC_R_SUCCESS;
current = rbt->root;
while (current != NULL) {
node_name(current, &current_name);
compared = dns_name_fullcompare(search_name, &current_name,
&order, &common_labels);
/*
* last_compared is used as a shortcut to start (or
* continue rather) finding the stop-node of the search
* when hashing was used (see much below in this
* function).
*/
last_compared = current;
if (compared == dns_namereln_equal) {
break;
}
if (compared == dns_namereln_none) {
/*
* Here, current is pointing at a subtree root
* node. We try to find a matching node using
* the hashtable. We can get one of 3 results
* here: (a) we locate the matching node, (b) we
* find a node to which the current node has a
* subdomain relation, (c) we fail to find (a)
* or (b).
*/
dns_name_t hash_name;
dns_rbtnode_t *hnode;
dns_rbtnode_t *up_current;
unsigned int nlabels;
unsigned int tlabels = 1;
uint32_t hashval;
uint32_t hash;
/*
 * The case of current not being a subtree root
 * (meaning a left or right pointer was
 * followed) only happened when the algorithm
 * fell through to the traditional binary search
 * because of a bitstring label. Since we
 * dropped bitstring support, this should
 * not happen.
 */
INSIST(current->is_root);
nlabels = dns_name_countlabels(search_name);
/*
* current is the root of the current level, so
* its parent is the same as its "up" pointer.
*/
up_current = current->parent;
dns_name_init(&hash_name, NULL);
hashagain:
hindex = rbt->hindex;
/*
 * Compute the hash over the full absolute
 * name. Look for the smallest suffix match at
 * this tree level (hlabels labels have already
 * been matched above it), and then at every
 * iteration, look for the next smallest suffix
 * match (add another subdomain label to the
 * absolute name being hashed).
 */
dns_name_getlabelsequence(name, nlabels - tlabels,
hlabels + tlabels,
&hash_name);
hashval = dns_name_hash(&hash_name);
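/*
 * A worked example of the label arithmetic above
 * (hypothetical name): when the full name is
 * "a.b.c.example." and two labels ("example" plus the
 * root) were already matched at upper levels, hlabels
 * is 2 and search_name is "a.b.c", so nlabels is 3.
 * With tlabels = 1 the sequence hashed is the absolute
 * suffix "c.example." (first label nlabels - tlabels
 * = 2, length hlabels + tlabels = 3).
 */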
dns_name_getlabelsequence(search_name,
nlabels - tlabels, tlabels,
&hash_name);
nexttable:
/*
* Walk all the nodes in the hash bucket pointed
* to by the computed hash value.
*/
hash = isc_hash_bits32(hashval, rbt->hashbits[hindex]);
for (hnode = rbt->hashtable[hindex][hash];
hnode != NULL; hnode = hnode->hashnext)
2008-01-22 23:28:04 +00:00
{
dns_name_t hnode_name;
if (hashval != hnode->hashval) {
continue;
}
/*
 * This checks that the hashed label sequence
 * being looked up is at the same tree level, so
 * that we don't match a label sequence from
 * some other subdomain.
 */
if (get_upper_node(hnode) != up_current) {
2008-01-22 23:28:04 +00:00
continue;
}
dns_name_init(&hnode_name, NULL);
node_name(hnode, &hnode_name);
if (dns_name_equal(&hnode_name, &hash_name)) {
break;
}
}
if (hnode != NULL) {
current = hnode;
/*
* This is an optimization. If hashing found
* the right node, the next call to
* dns_name_fullcompare() would obviously
* return _equal or _subdomain. Determine
* which of those would be the case by
* checking if the full name was hashed. Then
* make it look like dns_name_fullcompare
* was called and jump to the right place.
*/
if (tlabels == nlabels) {
compared = dns_namereln_equal;
break;
} else {
common_labels = tlabels;
compared = dns_namereln_subdomain;
goto subdomain;
}
}
if (TRY_NEXTTABLE(hindex, rbt)) {
/*
* Rehashing in progress, check the other table
*/
hindex = RBT_HASH_NEXTTABLE(rbt->hindex);
goto nexttable;
}
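/*
 * During incremental rehashing two hash tables coexist:
 * insertions go only into the new table, while lookups
 * and deletions (as here) must probe both. A minimal
 * sketch of the two-table probe used above, assuming
 * rbt->hashtable[] has two slots and
 * RBT_HASH_NEXTTABLE() flips between them:
 *
 *	for (int probe = 0; probe < 2; probe++) {
 *		hash = isc_hash_bits32(
 *			hashval, rbt->hashbits[hindex]);
 *		for (hnode = rbt->hashtable[hindex][hash];
 *		     hnode != NULL; hnode = hnode->hashnext)
 *		{
 *			... compare hashval and name ...
 *		}
 *		hindex = RBT_HASH_NEXTTABLE(hindex);
 *	}
 */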
if (tlabels++ < nlabels) {
goto hashagain;
}
/*
* All of the labels have been tried against the hash
* table.
*/
current = NULL;
continue;
} else {
/*
* The names have some common suffix labels.
*
* If the number of labels in common equals the
* number of labels in the current node's name,
* then follow the down pointer and search in
* the new tree.
*/
if (compared == dns_namereln_subdomain) {
subdomain:
/*
* Whack off the current node's common parts
* for the name to search in the next level.
*/
dns_name_split(search_name, common_labels,
search_name, NULL);
hlabels += common_labels;
/*
* This might be the closest enclosing name.
*/
if (WANTEMPTYDATA_OR_DATA(options, current)) {
*node = current;
}
/*
* Point the chain to the next level. This
* needs to be done before 'current' is pointed
* there because the callback in the next
* block of code needs the current 'current'.
* However, if the callback requests that the
* search be stopped, the DNS_R_PARTIALMATCH
* code at the end of this function needs the
* chain pointed to the next level.
*/
ADD_LEVEL(chain, current);
/*
* The caller may want to interrupt the
* downward search when certain special nodes
* are traversed. If this is a special node,
* the callback is used to learn what the
* caller wants to do.
*/
if (callback != NULL && current->find_callback)
{
result = chain_name(
chain, callback_name, false);
if (result != ISC_R_SUCCESS) {
dns_rbtnodechain_reset(chain);
return (result);
}
result =
(callback)(current,
callback_name,
callback_arg
DNS__DB_FLARG_PASS);
if (result != DNS_R_CONTINUE) {
saved_result = result;
/*
* Treat this node as if it
* had no down pointer.
*/
current = NULL;
break;
}
}
/*
* Finally, head to the next tree level.
*/
current = current->down;
} else {
/*
* Though there are labels in common, the
* entire name at this node is not common
* with the search name, so the search
* name does not exist in the tree.
*/
INSIST(compared ==
dns_namereln_commonancestor ||
compared == dns_namereln_contains);
current = NULL;
}
}
}
/*
* If current is not NULL, NOEXACT is not disallowing exact matches,
* and either the node has data or an empty node is ok, return
* ISC_R_SUCCESS to indicate an exact match.
*/
if (current != NULL && (options & DNS_RBTFIND_NOEXACT) == 0 &&
WANTEMPTYDATA_OR_DATA(options, current))
{
/*
* Found an exact match.
*/
chain->end = current;
chain->level_matches = chain->level_count;
if (foundname != NULL) {
result = chain_name(chain, foundname, true);
2008-01-22 23:28:04 +00:00
} else {
result = ISC_R_SUCCESS;
}
if (result == ISC_R_SUCCESS) {
*node = current;
result = saved_result;
} else {
*node = NULL;
}
} else {
/*
* Did not find an exact match (or did not want one).
*/
if (*node != NULL) {
/*
* ... but found a partially matching superdomain.
* Unwind the chain to the partial match node
* to set level_matches to the level above the node,
* and then to derive the name.
*
* chain->level_count is guaranteed to be at least 1
* here because by definition of finding a superdomain,
* the chain is pointed to at least the first subtree.
*/
chain->level_matches = chain->level_count - 1;
while (chain->levels[chain->level_matches] != *node) {
INSIST(chain->level_matches > 0);
chain->level_matches--;
}
if (foundname != NULL) {
unsigned int saved_count = chain->level_count;
chain->level_count = chain->level_matches + 1;
result = chain_name(chain, foundname, false);
chain->level_count = saved_count;
} else {
result = ISC_R_SUCCESS;
}
if (result == ISC_R_SUCCESS) {
result = DNS_R_PARTIALMATCH;
}
} else {
result = ISC_R_NOTFOUND;
}
if (current != NULL) {
/*
* There was an exact match but either
* DNS_RBTFIND_NOEXACT was set, or
* DNS_RBTFIND_EMPTYDATA was set and the node had no
* data. A policy decision was made to set the
* chain to the exact match, but this is subject
* to change if it becomes apparent that something
* else would be more useful. It is important that
* this case is handled here, because the predecessor
* setting code below assumes the match was not exact.
*/
INSIST(((options & DNS_RBTFIND_NOEXACT) != 0) ||
((options & DNS_RBTFIND_EMPTYDATA) == 0 &&
current->data == NULL));
chain->end = current;
} else if ((options & DNS_RBTFIND_NOPREDECESSOR) != 0) {
/*
* Ensure the chain points nowhere.
*/
chain->end = NULL;
} else {
/*
* Since there was no exact match, the chain argument
* needs to be pointed at the DNSSEC predecessor of
* the search name.
*/
if (compared == dns_namereln_subdomain) {
/*
* Attempted to follow a down pointer that was
* NULL, which means the searched for name was
* a subdomain of a terminal name in the tree.
* Since there are no existing subdomains to
* order against, the terminal name is the
* predecessor.
*/
INSIST(chain->level_count > 0);
INSIST(chain->level_matches <
chain->level_count);
chain->end =
chain->levels[--chain->level_count];
} else {
isc_result_t result2;
/*
* Point current to the node that stopped
* the search.
*
* With the hashing modification that has been
* added to the algorithm, the stop node of a
* standard binary search is not known. So it
* has to be found. There is probably a more
* clever way of doing this.
*
* The assignment of current to NULL when
* the relationship is *not* dns_namereln_none,
* even though it later gets set to the same
* last_compared anyway, is simply to avoid
* indenting the while loop one more level.
*/
if (compared == dns_namereln_none) {
current = last_compared;
} else {
current = NULL;
}
while (current != NULL) {
node_name(current, &current_name);
compared = dns_name_fullcompare(
search_name, &current_name,
&order, &common_labels);
POST(compared);
last_compared = current;
/*
* Standard binary search movement.
*/
if (order < 0) {
current = current->left;
} else {
current = current->right;
}
}
current = last_compared;
/*
* Reached a point within a level tree that
* positively indicates the name is not
* present, but the stop node could be either
* less than the desired name (order > 0) or
* greater than the desired name (order < 0).
*
* If the stop node is less, it is not
* necessarily the predecessor. If the stop
* node has a down pointer, then the real
* predecessor is at the end of a level below
* (not necessarily the next level).
* Move down levels until the rightmost node
* does not have a down pointer.
*
* When the stop node is greater, it is
* the successor. All the logic for finding
* the predecessor is handily encapsulated
* in dns_rbtnodechain_prev. In the event
* that the search name is less than anything
* else in the tree, the chain is reset.
* XXX DCL What is the best way for the caller
* to know that the search name has
* no predecessor?
*/
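/*
 * A worked example (hypothetical tree): suppose the
 * tree holds "a.", "c." and "x.c.", and the search
 * name is "d.". The binary search above stops at
 * "c." with order > 0. Because "c." has a down
 * pointer, the predecessor of "d." is not "c."
 * itself but the rightmost name below it, "x.c.",
 * which move_chain_to_last() locates.
 */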
if (order > 0) {
if (current->down != NULL) {
ADD_LEVEL(chain, current);
result2 = move_chain_to_last(
chain, current->down);
if (result2 != ISC_R_SUCCESS) {
result = result2;
}
} else {
/*
* Ah, the pure and simple
* case. The stop node is the
* predecessor.
*/
chain->end = current;
}
} else {
INSIST(order < 0);
chain->end = current;
result2 = dns_rbtnodechain_prev(
chain, NULL, NULL);
if (result2 == ISC_R_SUCCESS ||
result2 == DNS_R_NEWORIGIN)
{
/* Nothing. */
} else if (result2 == ISC_R_NOMORE) {
/*
* There is no predecessor.
*/
dns_rbtnodechain_reset(chain);
} else {
result = result2;
}
}
}
}
}
ENSURE(*node == NULL || DNS_RBTNODE_VALID(*node));
return (result);
}
/*
* Remove a node from the tree of trees.
*
* NOTE WELL: deletion is *not* symmetric with addition; that is, reversing
* a sequence of additions to be deletions will not generally get the
* tree back to the state it started in. For example, if the addition
* of "b.c" caused the node "a.b.c" to be split, pushing "a" to its own level,
* then the subsequent deletion of "b.c" will not cause "a" to be pulled up,
* restoring "a.b.c". The RBT *used* to do this kind of rejoining, but it
* turned out to be a bad idea because it could corrupt an active nodechain
* that had "b.c" as one of its levels -- and the RBT has no idea what
* nodechains are in use by callers, so it can't even *try* to helpfully
* fix them up (which would probably be doomed to failure anyway).
*
* Similarly, it is possible to leave the tree in a state where a supposedly
* deleted node still exists. The first case of this is obvious; take
* the tree which has "b.c" on one level, pointing to "a". Now delete "b.c".
* It was just established in the previous paragraph why we can't pull "a"
* back up to its parent level. But what happens when "a" then gets deleted?
* "b.c" is left hanging around without data or children. This condition
* is actually pretty easy to detect, but ... should it really be removed?
* Is a chain pointing to it? An iterator? Who knows! (Note that the
* references structure member cannot be looked at because it is private to
* rbtdb.) This is ugly and makes me unhappy, but after hours of trying to
* make it more aesthetically proper and getting nowhere, this is the way it
* is going to stay until such time as it proves to be a *real* problem.
*
* Finally, for reference, note that the original routine that did node
* joining was called join_nodes(). It has been excised, living now only
* in the CVS history, but comments have been left behind that point to it just
* in case someone wants to muck with this some more.
*
* The one positive aspect of all of this is that joining used to have a
* case where it might fail. Now that joining is not attempted, this function
* always succeeds. It still returns isc_result_t, though, so the API wouldn't
* change.
*/
isc_result_t
dns_rbt_deletenode(dns_rbt_t *rbt, dns_rbtnode_t *node, bool recurse) {
dns_rbtnode_t *parent;
REQUIRE(VALID_RBT(rbt));
REQUIRE(DNS_RBTNODE_VALID(node));
INSIST(rbt->nodecount != 0);
if (node->down != NULL) {
if (recurse) {
node->down->parent = NULL;
deletetreeflat(rbt, 0, true, &node->down);
} else {
if (node->data != NULL && rbt->data_deleter != NULL) {
rbt->data_deleter(node->data, rbt->deleter_arg);
}
node->data = NULL;
/*
* Since there is at least one node below this one and
* no recursion was requested, the deletion is
* complete. The down node from this node might be all
* by itself on a single level, so join_nodes() could
* be used to collapse the tree (with all the caveats
* of the comment at the start of this function).
* But the join_nodes() function has now been removed.
*/
return (ISC_R_SUCCESS);
}
}
/*
* Note the node that points to the level of the node
* that is being deleted. If the deleted node is the
* top level, parent will be set to NULL.
*/
parent = get_upper_node(node);
/*
* This node now has no down pointer, so now it needs
* to be removed from this level.
*/
deletefromlevel(node, parent == NULL ? &rbt->root : &parent->down);
if (node->data != NULL && rbt->data_deleter != NULL) {
rbt->data_deleter(node->data, rbt->deleter_arg);
}
unhash_node(rbt, node);
#if DNS_RBT_USEMAGIC
node->magic = 0;
#endif /* if DNS_RBT_USEMAGIC */
isc_refcount_destroy(&node->references);
freenode(rbt, &node);
/*
* This function never fails.
*/
return (ISC_R_SUCCESS);
}
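/*
 * A minimal usage sketch (assuming 'rbt' was created with a
 * data deleter and 'node' was obtained from a prior lookup,
 * e.g. via dns_rbt_findnode()):
 *
 *	result = dns_rbt_deletenode(rbt, node, false);
 *	INSIST(result == ISC_R_SUCCESS);
 *
 * With recurse set to true the node's entire down tree is
 * deleted as well; with false, a node that still has a down
 * pointer only has its data freed (see above).
 */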
void
dns_rbt_namefromnode(dns_rbtnode_t *node, dns_name_t *name) {
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(name != NULL);
REQUIRE(name->offsets == NULL);
node_name(node, name);
}
isc_result_t
dns_rbt_fullnamefromnode(dns_rbtnode_t *node, dns_name_t *name) {
dns_name_t current;
isc_result_t result;
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(name != NULL);
REQUIRE(name->buffer != NULL);
dns_name_init(&current, NULL);
dns_name_reset(name);
do {
INSIST(node != NULL);
node_name(node, &current);
result = dns_name_concatenate(name, &current, name, NULL);
if (result != ISC_R_SUCCESS) {
break;
}
node = get_upper_node(node);
} while (!dns_name_isabsolute(name));
return (result);
}
char *
dns_rbt_formatnodename(dns_rbtnode_t *node, char *printname,
unsigned int size) {
dns_fixedname_t fixedname;
dns_name_t *name;
isc_result_t result;
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(printname != NULL);
name = dns_fixedname_initname(&fixedname);
result = dns_rbt_fullnamefromnode(node, name);
if (result == ISC_R_SUCCESS) {
dns_name_format(name, printname, size);
} else {
snprintf(printname, size, "<error building name: %s>",
isc_result_totext(result));
}
return (printname);
}
static dns_rbtnode_t *
rbtnode_new(isc_mem_t *mctx, const dns_name_t *name) {
dns_rbtnode_t *node = NULL;
isc_region_t region;
unsigned int labels;
size_t nodelen;
REQUIRE(name->offsets != NULL);
dns_name_toregion(name, &region);
labels = dns_name_countlabels(name);
ENSURE(labels > 0);
/*
* Allocate space for the node structure, the name, and the offsets.
*/
nodelen = sizeof(dns_rbtnode_t) + region.length + labels + 1;
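/*
 * Illustrative example: for "isc.org." the wire-format name
 * is 9 bytes and dns_name_countlabels() returns 3 (the root
 * label included), so nodelen is
 * sizeof(dns_rbtnode_t) + 9 + 3 + 1.
 */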
node = isc_mem_get(mctx, nodelen);
*node = (dns_rbtnode_t){
.color = BLACK,
.nsec = DNS_DB_NSEC_NORMAL,
};
ISC_LINK_INIT(node, deadlink);
isc_refcount_init(&node->references, 0);
/*
* The following is stored to make reconstructing a name from the
* stored value in the node easy: the length of the name, the number
* of labels, whether the name is absolute or not, the name itself,
* and the name's offsets table.
*
* XXX RTH
* The offsets table could be made smaller by eliminating the
* first offset, which is always 0. This requires changes to
* lib/dns/name.c.
*
* Note: OLDOFFSETLEN *must* be assigned *after* OLDNAMELEN is assigned
* as it uses OLDNAMELEN.
*/
node->oldnamelen = node->namelen = region.length;
OLDOFFSETLEN(node) = node->offsetlen = labels;
node->absolute = name->attributes.absolute;
memmove(NAME(node), region.base, region.length);
memmove(OFFSETS(node), name->offsets, labels);
#if DNS_RBT_USEMAGIC
node->magic = DNS_RBTNODE_MAGIC;
#endif /* if DNS_RBT_USEMAGIC */
return (node);
}
/*
* Add a node to the hash table
*/
static void
hash_add_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
uint32_t hash;
REQUIRE(name != NULL);
node->hashval = dns_name_hash(name);
hash = isc_hash_bits32(node->hashval, rbt->hashbits[rbt->hindex]);
node->hashnext = rbt->hashtable[rbt->hindex][hash];
rbt->hashtable[rbt->hindex][hash] = node;
}
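/*
 * Note that insertion only ever targets the table selected by
 * rbt->hindex; during incremental rehashing the old table is
 * only read from and drained, never added to.
 */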
/*
* Initialize hash table
*/
static void
hashtable_new(dns_rbt_t *rbt, uint8_t index, uint8_t bits) {
REQUIRE(rbt->hashbits[index] == 0U);
REQUIRE(rbt->hashtable[index] == NULL);
REQUIRE(bits >= ISC_HASH_MIN_BITS);
REQUIRE(bits < ISC_HASH_MAX_BITS);
rbt->hashbits[index] = bits;
rbt->hashtable[index] = isc_mem_cget(rbt->mctx,
ISC_HASHSIZE(rbt->hashbits[index]),
sizeof(dns_rbtnode_t *));
}
static void
hashtable_free(dns_rbt_t *rbt, uint8_t index) {
isc_mem_cput(rbt->mctx, rbt->hashtable[index],
ISC_HASHSIZE(rbt->hashbits[index]),
sizeof(dns_rbtnode_t *));
rbt->hashbits[index] = 0U;
rbt->hashtable[index] = NULL;
}
Fix the rbt hashtable and grow it when setting max-cache-size There were several problems with rbt hashtable implementation: 1. Our internal hashing function returns uint64_t value, but it was silently truncated to unsigned int in dns_name_hash() and dns_name_fullhash() functions. As the SipHash 2-4 higher bits are more random, we need to use the upper half of the return value. 2. The hashtable implementation in rbt.c was using modulo to pick the slot number for the hash table. This has several problems because modulo is: a) slow, b) oblivious to patterns in the input data. This could lead to very uneven distribution of the hashed data in the hashtable. Combined with the single-linked lists we use, it could really hog-down the lookup and removal of the nodes from the rbt tree[a]. The Fibonacci Hashing is much better fit for the hashtable function here. For longer description, read "Fibonacci Hashing: The Optimization that the World Forgot"[b] or just look at the Linux kernel. Also this will make Diego very happy :). 3. The hashtable would rehash every time the number of nodes in the rbt tree would exceed 3 * (hashtable size). The overcommit will make the uneven distribution in the hashtable even worse, but the main problem lies in the rehashing - every time the database grows beyond the limit, each subsequent rehashing will be much slower. The mitigation here is letting the rbt know how big the cache can grown and pre-allocate the hashtable to be big enough to actually never need to rehash. This will consume more memory at the start, but since the size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it will only consume maximum of 32GB of memory for hashtable in the worst case (and max-cache-size would need to be set to more than 4TB). Calling the dns_db_adjusthashsize() will also cap the maximum size of the hashtable to the pre-computed number of bits, so it won't try to consume more gigabytes of memory than available for the database. FIXME: What is the average size of the rbt node that gets hashed? I chose the pagesize (4k) as initial value to precompute the size of the hashtable, but the value is based on feeling and not any real data. For future work, there are more places where we use result of the hash value modulo some small number and that would benefit from Fibonacci Hashing to get better distribution. Notes: a. A doubly linked list should be used here to speedup the removal of the entries from the hashtable. b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
2020-07-16 10:29:54 +02:00
static uint32_t
rehash_bits(dns_rbt_t *rbt, size_t newcount) {
uint32_t newbits = rbt->hashbits[rbt->hindex];
while (newcount >= ISC_HASHSIZE(newbits) && newbits < ISC_HASH_MAX_BITS)
{
newbits += 1;
}
return (newbits);
}
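/*
 * Worked example (added for illustration, not part of the original
 * source; it assumes ISC_HASHSIZE(bits) expands to (1 << bits)):
 * with rbt->hashbits[rbt->hindex] == 10 and newcount == 5000, the
 * loop walks 10 -> 11 -> 12 -> 13, because 5000 >= 1024, 2048 and
 * 4096 but 5000 < 8192, so rehash_bits() returns 13. Each extra bit
 * doubles the table, which provides the at-least-2x growth factor
 * that draining one old bucket per insertion requires.
 */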
/*
* Rebuild the hashtable to reduce the load factor
*/
static void
hashtable_rehash(dns_rbt_t *rbt, uint32_t newbits) {
uint8_t oldindex = rbt->hindex;
uint32_t oldbits = rbt->hashbits[oldindex];
uint8_t newindex = RBT_HASH_NEXTTABLE(oldindex);
REQUIRE(rbt->hashbits[oldindex] >= ISC_HASH_MIN_BITS);
REQUIRE(rbt->hashbits[oldindex] <= ISC_HASH_MAX_BITS);
REQUIRE(rbt->hashtable[oldindex] != NULL);
REQUIRE(newbits <= ISC_HASH_MAX_BITS);
REQUIRE(rbt->hashbits[newindex] == 0U);
REQUIRE(rbt->hashtable[newindex] == NULL);
REQUIRE(newbits > oldbits);
hashtable_new(rbt, newindex, newbits);
rbt->hindex = newindex;
hashtable_rehash_one(rbt);
}
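/*
 * Sketch of the two-table toggle used above (hypothetical
 * reconstruction; the real macro may differ): the rbt keeps two
 * hashtable slots and RBT_HASH_NEXTTABLE() flips between them, e.g.:
 *
 *	#define RBT_HASH_NEXTTABLE(hindex) ((hindex == 0) ? 1 : 0)
 *
 * After hashtable_rehash() returns, rbt->hindex names the freshly
 * allocated table, while the old table stays reachable at
 * RBT_HASH_NEXTTABLE(rbt->hindex) until it has been fully drained.
 */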
static void
hashtable_rehash_one(dns_rbt_t *rbt) {
dns_rbtnode_t **newtable = rbt->hashtable[rbt->hindex];
uint32_t oldsize =
ISC_HASHSIZE(rbt->hashbits[RBT_HASH_NEXTTABLE(rbt->hindex)]);
dns_rbtnode_t **oldtable =
rbt->hashtable[RBT_HASH_NEXTTABLE(rbt->hindex)];
dns_rbtnode_t *node = NULL;
dns_rbtnode_t *nextnode;
/* Find the first non-empty bucket */
while (rbt->hiter < oldsize && oldtable[rbt->hiter] == NULL) {
rbt->hiter++;
}
/* Rehashing complete */
if (rbt->hiter == oldsize) {
hashtable_free(rbt, RBT_HASH_NEXTTABLE(rbt->hindex));
rbt->hiter = 0;
return;
}
/*
 * Move all nodes from the first non-empty bucket of the old
 * hashtable to the new hashtable.
 */
for (node = oldtable[rbt->hiter]; node != NULL; node = nextnode) {
uint32_t hash = isc_hash_bits32(node->hashval,
rbt->hashbits[rbt->hindex]);
nextnode = node->hashnext;
node->hashnext = newtable[hash];
newtable[hash] = node;
}
oldtable[rbt->hiter] = NULL;
rbt->hiter++;
}
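/*
 * Note (illustrative): each call migrates exactly one bucket, the
 * chain at oldtable[rbt->hiter], so the rehashing cost paid by a
 * single insertion stays bounded. The slot is recomputed from
 * node->hashval because the new table uses more bits, so one old
 * bucket's chain can fan out across several new buckets.
 */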
static void
maybe_rehash(dns_rbt_t *rbt, size_t newcount) {
uint32_t newbits = rehash_bits(rbt, newcount);
if (rbt->hashbits[rbt->hindex] < newbits &&
newbits <= ISC_HASH_MAX_BITS)
{
hashtable_rehash(rbt, newbits);
}
}
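/*
 * Note (illustrative): rehash_bits() never returns more than
 * ISC_HASH_MAX_BITS, so once the table has reached that size the
 * comparison in maybe_rehash() is false and the call is a no-op.
 */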
static bool
rehashing_in_progress(dns_rbt_t *rbt) {
return (rbt->hashtable[RBT_HASH_NEXTTABLE(rbt->hindex)] != NULL);
}
static bool
hashtable_is_overcommited(dns_rbt_t *rbt) {
return (rbt->nodecount >= (ISC_HASHSIZE(rbt->hashbits[rbt->hindex]) *
ISC_HASH_OVERCOMMIT));
}
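/*
 * Worked example (added for illustration; it assumes
 * ISC_HASH_OVERCOMMIT is 3, which may differ in the real headers):
 * with hashbits == 10 the table has 1024 buckets, so the check fires
 * once nodecount reaches 3 * 1024 == 3072, i.e. an average chain
 * length of three nodes per bucket.
 */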
/*
* Add a node to the hash table. Rehash the hashtable if the node count
* rises above a critical level.
*/
static void
hash_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
REQUIRE(DNS_RBTNODE_VALID(node));
if (rehashing_in_progress(rbt)) {
/* Rehash in progress */
hashtable_rehash_one(rbt);
} else if (hashtable_is_overcommited(rbt)) {
/* Rehash requested */
maybe_rehash(rbt, rbt->nodecount);
}
hash_add_node(rbt, node, name);
}
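/*
 * Flow summary (illustrative): every insertion performs at most a
 * small, bounded amount of rehashing work: one bucket migration while
 * a rehash is in flight, or the allocation of a larger table (plus
 * one initial bucket migration) when the load check fires. Only then
 * is the node itself linked in by hash_add_node().
 */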
/*
* Remove a node from the hash table
*/
static void
unhash_node(dns_rbt_t *rbt, dns_rbtnode_t *dnode) {
uint32_t hash;
uint8_t hindex = rbt->hindex;
dns_rbtnode_t *hnode;
REQUIRE(DNS_RBTNODE_VALID(dnode));
/*
 * The node could be in one of three places:
 * a) the current table: no rehashing is in progress, or
 * b) the current table: the node has already been moved, or
 * c) the other table: the node has not been moved yet.
 */
nexttable:
hash = isc_hash_bits32(dnode->hashval, rbt->hashbits[hindex]);
hnode = rbt->hashtable[hindex][hash];
if (hnode == dnode) {
rbt->hashtable[hindex][hash] = hnode->hashnext;
return;
} else {
for (; hnode != NULL; hnode = hnode->hashnext) {
if (hnode->hashnext == dnode) {
hnode->hashnext = dnode->hashnext;
return;
}
}
}
if (TRY_NEXTTABLE(hindex, rbt)) {
/* Rehashing in progress, delete from the other table */
hindex = RBT_HASH_NEXTTABLE(hindex);
goto nexttable;
}
/* We haven't found any matching node; this should not be possible. */
UNREACHABLE();
}
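
/*
 * A sketch of the two-table scheme used above: while an incremental
 * rehash is in progress, lookups and deletions consult the current
 * hash table first and then, via RBT_HASH_NEXTTABLE(), the other
 * table; insertions go only into the new table, migrating a few
 * entries from the old table as they proceed.
 */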
static void
rotate_left(dns_rbtnode_t *node, dns_rbtnode_t **rootp) {
dns_rbtnode_t *child;
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(rootp != NULL);
child = node->right;
INSIST(child != NULL);
node->right = child->left;
if (child->left != NULL) {
child->left->parent = node;
}
child->left = node;
child->parent = node->parent;
if (node->is_root) {
*rootp = child;
child->is_root = 1;
node->is_root = 0;
} else {
if (node->parent->left == node) {
node->parent->left = child;
} else {
node->parent->right = child;
}
}
node->parent = child;
}
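
/*
 * A pointer-level sketch of rotate_left(); rotate_right() is the
 * mirror image. N is 'node', C is its right child, and a/b/c are
 * arbitrary subtrees:
 *
 *        parent                parent
 *          |                     |
 *          N                     C
 *         / \        ==>        / \
 *        a   C                 N   c
 *           / \               / \
 *          b   c             a   b
 *
 * The in-order ordering a < N < b < C < c is preserved.
 */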
static void
rotate_right(dns_rbtnode_t *node, dns_rbtnode_t **rootp) {
dns_rbtnode_t *child;
REQUIRE(DNS_RBTNODE_VALID(node));
REQUIRE(rootp != NULL);
child = node->left;
INSIST(child != NULL);
node->left = child->right;
if (child->right != NULL) {
child->right->parent = node;
}
child->right = node;
child->parent = node->parent;
if (node->is_root) {
*rootp = child;
child->is_root = 1;
node->is_root = 0;
} else {
if (node->parent->left == node) {
node->parent->left = child;
} else {
node->parent->right = child;
}
}
node->parent = child;
}
/*
 * This is the real workhorse of the insertion code, because it performs
 * the true red/black tree rebalancing on a single level.
*/
static void
addonlevel(dns_rbtnode_t *node, dns_rbtnode_t *current, int order,
dns_rbtnode_t **rootp) {
dns_rbtnode_t *child, *root, *parent, *grandparent;
dns_name_t add_name, current_name;
dns_offsets_t add_offsets, current_offsets;
REQUIRE(rootp != NULL);
REQUIRE(DNS_RBTNODE_VALID(node) && node->left == NULL &&
node->right == NULL);
REQUIRE(current != NULL);
root = *rootp;
if (root == NULL) {
/*
* First node of a level.
*/
node->color = BLACK;
node->is_root = 1;
node->parent = current;
*rootp = node;
return;
}
child = root;
POST(child);
dns_name_init(&add_name, add_offsets);
node_name(node, &add_name);
dns_name_init(&current_name, current_offsets);
node_name(current, &current_name);
if (order < 0) {
INSIST(current->left == NULL);
current->left = node;
} else {
INSIST(current->right == NULL);
current->right = node;
}
INSIST(node->parent == NULL);
node->parent = current;
node->color = RED;
while (node != root && IS_RED(node->parent)) {
/*
* XXXDCL could do away with separate parent and grandparent
* variables. They are vestiges of the days before parent
* pointers. However, they make the code a little clearer.
*/
parent = node->parent;
grandparent = parent->parent;
if (parent == grandparent->left) {
child = grandparent->right;
if (child != NULL && IS_RED(child)) {
parent->color = BLACK;
child->color = BLACK;
grandparent->color = RED;
node = grandparent;
} else {
if (node == parent->right) {
rotate_left(parent, &root);
node = parent;
parent = node->parent;
grandparent = parent->parent;
}
parent->color = BLACK;
grandparent->color = RED;
rotate_right(grandparent, &root);
}
} else {
child = grandparent->left;
if (child != NULL && IS_RED(child)) {
parent->color = BLACK;
child->color = BLACK;
grandparent->color = RED;
node = grandparent;
} else {
if (node == parent->left) {
rotate_right(parent, &root);
node = parent;
parent = node->parent;
grandparent = parent->parent;
}
parent->color = BLACK;
grandparent->color = RED;
rotate_left(grandparent, &root);
}
}
}
root->color = BLACK;
ENSURE(root->is_root);
*rootp = root;
return;
}
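
/*
 * A sketch of the two rebalancing cases handled by the loop above for
 * a RED node N under a left-side parent P (the right side is the
 * mirror image). With a RED uncle U, recolor and move the violation up
 * to the grandparent G; with a BLACK (or missing) uncle, rotate:
 *
 *        G(B)                G(R)
 *        /  \                /  \
 *     P(R)   U(R)   ==>   P(B)   U(B)     (recolor, continue at G)
 *      /                   /
 *    N(R)                N(R)
 *
 *        G(B)                P(B)
 *        /  \                /  \
 *     P(R)   U(B)   ==>   N(R)   G(R)     (rotate_right(G))
 *      /                           \
 *    N(R)                           U(B)
 */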
/*
 * This is the real workhorse of the deletion code, because it performs
 * the true red/black tree rebalancing on a single level.
*/
static void
deletefromlevel(dns_rbtnode_t *item, dns_rbtnode_t **rootp) {
dns_rbtnode_t *child, *sibling, *parent;
dns_rbtnode_t *successor;
REQUIRE(item != NULL);
/*
* Verify that the parent history is (apparently) correct.
*/
INSIST((item->is_root && *rootp == item) ||
(!item->is_root &&
(item->parent->left == item || item->parent->right == item)));
child = NULL;
if (item->left == NULL) {
if (item->right == NULL) {
if (item->is_root) {
/*
* This is the only item in the tree.
*/
*rootp = NULL;
return;
}
} else {
/*
* This node has one child, on the right.
*/
child = item->right;
}
} else if (item->right == NULL) {
/*
* This node has one child, on the left.
*/
child = item->left;
} else {
dns_rbtnode_t *saved_parent, *saved_right;
int saved_color;
/*
* This node has two children, so it cannot be directly
* deleted. Find its immediate in-order successor and
* move it to this location, then do the deletion at the
* old site of the successor.
*/
successor = item->right;
while (successor->left != NULL) {
successor = successor->left;
}
/*
* The successor cannot possibly have a left child;
* if there is any child, it is on the right.
*/
if (successor->right != NULL) {
child = successor->right;
}
/*
* Swap the two nodes; it would be simpler to just replace
* the value being deleted with that of the successor,
* but this rigamarole is done so the caller has complete
* control over the pointers (and memory allocation) of
 * all of the nodes. If just the key value were removed from
* the tree, the pointer to the node would be unchanged.
*/
/*
* First, put the successor in the tree location of the
* node to be deleted. Save its existing tree pointer
* information, which will be needed when linking up
* delete to the successor's old location.
*/
saved_parent = successor->parent;
saved_right = successor->right;
saved_color = successor->color;
if (item->is_root) {
*rootp = successor;
successor->is_root = true;
item->is_root = false;
} else if (item->parent->left == item) {
item->parent->left = successor;
2008-01-22 23:28:04 +00:00
} else {
item->parent->right = successor;
}
successor->parent = item->parent;
successor->left = item->left;
successor->right = item->right;
successor->color = item->color;
if (successor->left != NULL) {
successor->left->parent = successor;
}
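/*
 * If the successor was item's own right child, then
 * successor->right now points back at the successor itself.
 * Skip the parent fixup in that case; the self-reference is
 * overwritten below when the two nodes are relinked.
 */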
if (successor->right != successor) {
successor->right->parent = successor;
}
/*
* Now relink the node to be deleted into the
* successor's previous tree location.
*/
INSIST(!item->is_root);
if (saved_parent == item) {
/*
* Node being deleted was successor's parent.
*/
successor->right = item;
item->parent = successor;
} else {
saved_parent->left = item;
item->parent = saved_parent;
2008-01-22 23:28:04 +00:00
}
/*
* Original location of successor node has no left.
*/
item->left = NULL;
item->right = saved_right;
item->color = saved_color;
}
/*
* Remove the node by removing the links from its parent.
*/
if (!item->is_root) {
if (item->parent->left == item) {
item->parent->left = child;
} else {
item->parent->right = child;
}
if (child != NULL) {
child->parent = item->parent;
}
} else {
/*
* This is the root being deleted, and at this point
* it is known to have just one child.
*/
*rootp = child;
child->is_root = 1;
child->parent = item->parent;
}
/*
* Fix color violations.
*/
if (IS_BLACK(item)) {
parent = item->parent;
while (child != *rootp && IS_BLACK(child)) {
INSIST(child == NULL || !child->is_root);
if (parent->left == child) {
sibling = parent->right;
if (IS_RED(sibling)) {
sibling->color = BLACK;
parent->color = RED;
rotate_left(parent, rootp);
sibling = parent->right;
}
INSIST(sibling != NULL);
if (IS_BLACK(sibling->left) &&
IS_BLACK(sibling->right))
{
sibling->color = RED;
child = parent;
} else {
if (IS_BLACK(sibling->right)) {
sibling->left->color = BLACK;
sibling->color = RED;
rotate_right(sibling, rootp);
sibling = parent->right;
}
sibling->color = parent->color;
parent->color = BLACK;
INSIST(sibling->right != NULL);
sibling->right->color = BLACK;
rotate_left(parent, rootp);
child = *rootp;
}
} else {
/*
* Child is parent's right child.
* Everything is done the same as above,
* except mirrored.
*/
sibling = parent->left;
if (IS_RED(sibling)) {
sibling->color = BLACK;
parent->color = RED;
rotate_right(parent, rootp);
sibling = parent->left;
2008-01-22 23:28:04 +00:00
}
INSIST(sibling != NULL);
if (IS_BLACK(sibling->left) &&
IS_BLACK(sibling->right))
{
sibling->color = RED;
child = parent;
} else {
if (IS_BLACK(sibling->left)) {
sibling->right->color = BLACK;
sibling->color = RED;
rotate_left(sibling, rootp);
sibling = parent->left;
}
sibling->color = parent->color;
parent->color = BLACK;
INSIST(sibling->left != NULL);
sibling->left->color = BLACK;
rotate_right(parent, rootp);
child = *rootp;
}
}
parent = child->parent;
}
if (IS_RED(child)) {
child->color = BLACK;
}
}
}
static void
freenode(dns_rbt_t *rbt, dns_rbtnode_t **nodep) {
dns_rbtnode_t *node = *nodep;
*nodep = NULL;
isc_mem_put(rbt->mctx, node, NODE_SIZE(node));
rbt->nodecount--;
}
static void
deletetreeflat(dns_rbt_t *rbt, unsigned int quantum, bool unhash,
dns_rbtnode_t **nodep) {
dns_rbtnode_t *root = *nodep;
while (root != NULL) {
/*
* If there is a left, right or down node, walk into it
* and iterate.
*/
if (root->left != NULL) {
dns_rbtnode_t *node = root;
root = root->left;
node->left = NULL;
} else if (root->right != NULL) {
dns_rbtnode_t *node = root;
root = root->right;
node->right = NULL;
} else if (root->down != NULL) {
dns_rbtnode_t *node = root;
root = root->down;
node->down = NULL;
} else {
/*
* There are no left, right or down nodes, so we
* can free this one and go back to its parent.
*/
dns_rbtnode_t *node = root;
root = root->parent;
if (rbt->data_deleter != NULL && node->data != NULL) {
rbt->data_deleter(node->data, rbt->deleter_arg);
}
if (unhash) {
unhash_node(rbt, node);
}
/*
 * Note: 'unhash' is false when the complete RBT tree is
 * being destroyed, so in that case the per-node
 * unhash_node() call above is skipped.
 */
#if DNS_RBT_USEMAGIC
node->magic = 0;
#endif /* if DNS_RBT_USEMAGIC */
freenode(rbt, &node);
if (quantum != 0 && --quantum == 0) {
break;
}
}
}
*nodep = root;
}
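
/*
 * A usage sketch (the caller shown here is hypothetical): destroy a
 * whole tree incrementally, freeing at most 1000 nodes per pass so
 * that the work can be interleaved with other processing:
 *
 *	dns_rbtnode_t *node = rbt->root;
 *	do {
 *		deletetreeflat(rbt, 1000, false, &node);
 *	} while (node != NULL);
 *
 * With a quantum of 0, a single call deletes the entire tree.
 */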
static size_t
getheight_helper(dns_rbtnode_t *node) {
size_t dl, dr;
size_t this_height, down_height;
if (node == NULL) {
return (0);
}
dl = getheight_helper(node->left);
dr = getheight_helper(node->right);
this_height = ISC_MAX(dl + 1, dr + 1);
down_height = getheight_helper(node->down);
return (ISC_MAX(this_height, down_height));
}
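
/*
 * Note that a down link does not itself count as a step in the helper
 * above: the height of a down tree is folded in without the +1, so the
 * heights of successive levels along a path accumulate and the result
 * is the maximum cumulative height over the whole tree of trees.
 */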
size_t
dns__rbt_getheight(dns_rbt_t *rbt) {
return (getheight_helper(rbt->root));
}
static bool
check_properties_helper(dns_rbtnode_t *node) {
if (node == NULL) {
return (true);
}
if (IS_RED(node)) {
/* Root nodes must be BLACK. */
if (node->is_root) {
return (false);
}
/* Both children of RED nodes must be BLACK. */
if (IS_RED(node->left) || IS_RED(node->right)) {
return (false);
}
}
if ((node->down != NULL) && (!node->down->is_root)) {
return (false);
}
if (node->is_root) {
if ((node->parent != NULL) && (node->parent->down != node)) {
return (false);
}
if (get_upper_node(node) != node->parent) {
return (false);
}
}
/* If node is assigned to the down pointer of its parent, it is
 * a subtree root and must have the is_root flag set.
*/
if (((!node->parent) || (node->parent->down == node)) &&
(!node->is_root))
{
return (false);
}
/* Repeat tests with this node's children. */
return (check_properties_helper(node->left) &&
check_properties_helper(node->right) &&
check_properties_helper(node->down));
}
static bool
check_black_distance_helper(dns_rbtnode_t *node, size_t *distance) {
size_t dl, dr, dd;
if (node == NULL) {
*distance = 1;
return (true);
}
if (!check_black_distance_helper(node->left, &dl)) {
return (false);
}
if (!check_black_distance_helper(node->right, &dr)) {
return (false);
}
if (!check_black_distance_helper(node->down, &dd)) {
return (false);
}
/* Left and right side black node counts must match. */
if (dl != dr) {
return (false);
}
if (IS_BLACK(node)) {
dl++;
}
*distance = dl;
return (true);
}
bool
dns__rbt_checkproperties(dns_rbt_t *rbt) {
size_t dd;
if (!check_properties_helper(rbt->root)) {
return (false);
}
/* Every path from a given node to its leaves must contain
 * the same number of BLACK nodes. This is done separately
 * here instead of inside check_properties_helper() as
 * it would take O(n log n) complexity otherwise.
*/
return (check_black_distance_helper(rbt->root, &dd));
}
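
/*
 * A self-check sketch (the harness is hypothetical): after a batch of
 * insertions and deletions, the red/black invariants can be verified
 * in a debug build with:
 *
 *	INSIST(dns__rbt_checkproperties(rbt));
 */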
static void
dns_rbt_indent(FILE *f, int depth) {
int i;
fprintf(f, "%4d ", depth);
for (i = 0; i < depth; i++) {
fprintf(f, "- ");
}
}
void
dns_rbt_printnodeinfo(dns_rbtnode_t *n, FILE *f) {
if (n == NULL) {
fprintf(f, "Null node\n");
return;
}
fprintf(f, "Node info for nodename: ");
printnodename(n, true, f);
fprintf(f, "\n");
fprintf(f, "n = %p\n", n);
fprintf(f, "node lock address = %u\n", n->locknum);
fprintf(f, "Parent: %p\n", n->parent);
fprintf(f, "Right: %p\n", n->right);
fprintf(f, "Left: %p\n", n->left);
fprintf(f, "Down: %p\n", n->down);
fprintf(f, "Data: %p\n", n->data);
}
static void
printnodename(dns_rbtnode_t *node, bool quoted, FILE *f) {
isc_region_t r;
dns_name_t name;
char buffer[DNS_NAME_FORMATSIZE];
dns_offsets_t offsets;
r.length = node->namelen;
r.base = NAME(node);
dns_name_init(&name, offsets);
dns_name_fromregion(&name, &r);
dns_name_format(&name, buffer, sizeof(buffer));
if (quoted) {
fprintf(f, "\"%s\"", buffer);
} else {
fprintf(f, "%s", buffer);
}
}
static void
print_text_helper(dns_rbtnode_t *root, dns_rbtnode_t *parent, int depth,
const char *direction, void (*data_printer)(FILE *, void *),
FILE *f) {
dns_rbt_indent(f, depth);
if (root != NULL) {
printnodename(root, true, f);
fprintf(f, " (%s, %s", direction,
root->color == RED ? "RED" : "BLACK");
if ((!root->is_root && root->parent != parent) ||
(root->is_root && depth > 0 && root->parent->down != root))
{
fprintf(f, " (BAD parent pointer! -> ");
if (root->parent != NULL) {
printnodename(root->parent, true, f);
} else {
fprintf(f, "NULL");
}
fprintf(f, ")");
}
fprintf(f, ")");
if (root->data != NULL && data_printer != NULL) {
fprintf(f, " data@%p: ", root->data);
data_printer(f, root->data);
}
fprintf(f, "\n");
depth++;
if (root->color == RED && IS_RED(root->left)) {
fprintf(f, "** Red/Red color violation on left\n");
}
print_text_helper(root->left, root, depth, "left", data_printer,
f);
if (root->color == RED && IS_RED(root->right)) {
fprintf(f, "** Red/Red color violation on right\n");
}
print_text_helper(root->right, root, depth, "right",
data_printer, f);
print_text_helper(root->down, NULL, depth, "down", data_printer,
f);
} else {
fprintf(f, "NULL (%s)\n", direction);
}
}
void
dns_rbt_printtext(dns_rbt_t *rbt, void (*data_printer)(FILE *, void *),
FILE *f) {
REQUIRE(VALID_RBT(rbt));
print_text_helper(rbt->root, NULL, 0, "root", data_printer, f);
}
static int
print_dot_helper(dns_rbtnode_t *node, unsigned int *nodecount,
bool show_pointers, FILE *f) {
unsigned int l, r, d;
if (node == NULL) {
return (0);
}
l = print_dot_helper(node->left, nodecount, show_pointers, f);
r = print_dot_helper(node->right, nodecount, show_pointers, f);
d = print_dot_helper(node->down, nodecount, show_pointers, f);
*nodecount += 1;
fprintf(f, "node%u[label = \"<f0> |<f1> ", *nodecount);
printnodename(node, false, f);
fprintf(f, "|<f2>");
if (show_pointers) {
fprintf(f, "|<f3> n=%p|<f4> p=%p", node, node->parent);
}
fprintf(f, "\"] [");
if (IS_RED(node)) {
fprintf(f, "color=red");
} else {
fprintf(f, "color=black");
}
/* XXXMUKS: verify that IS_ROOT() indicates subtree root and not
* forest root.
*/
if (node->is_root) {
fprintf(f, ",penwidth=3");
}
if (node->data == NULL) {
fprintf(f, ",style=filled,fillcolor=lightgrey");
}
fprintf(f, "];\n");
if (node->left != NULL) {
fprintf(f, "\"node%u\":f0 -> \"node%u\":f1;\n", *nodecount, l);
}
if (node->down != NULL) {
fprintf(f, "\"node%u\":f1 -> \"node%u\":f1 [penwidth=5];\n",
*nodecount, d);
}
if (node->right != NULL) {
fprintf(f, "\"node%u\":f2 -> \"node%u\":f1;\n", *nodecount, r);
}
return (*nodecount);
}
void
dns_rbt_printdot(dns_rbt_t *rbt, bool show_pointers, FILE *f) {
unsigned int nodecount = 0;
REQUIRE(VALID_RBT(rbt));
fprintf(f, "digraph g {\n");
fprintf(f, "node [shape = record,height=.1];\n");
print_dot_helper(rbt->root, &nodecount, show_pointers, f);
fprintf(f, "}\n");
}
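
/*
 * A usage sketch (the file name is hypothetical): dump the tree in
 * Graphviz format and render it offline:
 *
 *	FILE *f = fopen("rbt.dot", "w");
 *	if (f != NULL) {
 *		dns_rbt_printdot(rbt, false, f);
 *		fclose(f);
 *	}
 *
 * and then, on the command line: dot -Tsvg rbt.dot > rbt.svg
 */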
/*
* Chain Functions
*/
void
dns_rbtnodechain_init(dns_rbtnodechain_t *chain) {
REQUIRE(chain != NULL);
/*
* Initialize 'chain'.
*/
chain->end = NULL;
chain->level_count = 0;
chain->level_matches = 0;
memset(chain->levels, 0, sizeof(chain->levels));
chain->magic = CHAIN_MAGIC;
}
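
/*
 * A minimal iteration sketch, assuming the companion functions
 * declared in rbt.h (dns_rbtnodechain_first() and
 * dns_rbtnodechain_invalidate()):
 *
 *	dns_rbtnodechain_t chain;
 *	dns_fixedname_t fixed;
 *	dns_name_t *name = dns_fixedname_initname(&fixed);
 *	isc_result_t result;
 *
 *	dns_rbtnodechain_init(&chain);
 *	result = dns_rbtnodechain_first(&chain, rbt, name, NULL);
 *	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
 *		(process 'name' here, or call dns_rbtnodechain_current())
 *		result = dns_rbtnodechain_next(&chain, name, NULL);
 *	}
 *	dns_rbtnodechain_invalidate(&chain);
 */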
isc_result_t
dns_rbtnodechain_current(dns_rbtnodechain_t *chain, dns_name_t *name,
dns_name_t *origin, dns_rbtnode_t **node) {
isc_result_t result = ISC_R_SUCCESS;
REQUIRE(VALID_CHAIN(chain));
SET_IF_NOT_NULL(node, chain->end);
if (chain->end == NULL) {
return (ISC_R_NOTFOUND);
}
if (name != NULL) {
node_name(chain->end, name);
if (chain->level_count == 0) {
/*
* Names in the top level tree are all absolute.
* Always make 'name' relative.
*/
INSIST(dns_name_isabsolute(name));
/*
* This is cheaper than
* dns_name_getlabelsequence().
*/
name->labels--;
name->length--;
name->attributes.absolute = false;
}
}
if (origin != NULL) {
if (chain->level_count > 0) {
result = chain_name(chain, origin, false);
} else {
dns_name_copy(dns_rootname, origin);
}
}
return (result);
}
isc_result_t
dns_rbtnodechain_prev(dns_rbtnodechain_t *chain, dns_name_t *name,
dns_name_t *origin) {
dns_rbtnode_t *current, *previous, *predecessor;
isc_result_t result = ISC_R_SUCCESS;
bool new_origin = false;
REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);
predecessor = NULL;
current = chain->end;
if (current->left != NULL) {
/*
* Moving left one then right as far as possible is the
* previous node, at least for this level.
*/
current = current->left;
while (current->right != NULL) {
current = current->right;
}
predecessor = current;
} else {
/*
* No left links, so move toward the root. If at any
* point on the way there the link from parent to child
* is a right link, then the parent is the previous
* node, at least for this level.
2008-01-22 23:28:04 +00:00
*/
while (!current->is_root) {
previous = current;
current = current->parent;
if (current->right == previous) {
predecessor = current;
break;
}
}
}
if (predecessor != NULL) {
/*
* Found a predecessor node in this level. It might not
* really be the predecessor, however.
*/
if (predecessor->down != NULL) {
/*
* The predecessor is really down at least one
* level. Go down and as far right as possible,
* and repeat as long as the rightmost node has
* a down pointer.
*/
do {
/*
* XXX DCL Need to do something about
* origins here. See whether to go down,
* and if so whether it is truly what
* Bob calls a new origin.
*/
ADD_LEVEL(chain, predecessor);
predecessor = predecessor->down;
/* XXX DCL duplicated from above; clever
* way to unduplicate? */
while (predecessor->right != NULL) {
predecessor = predecessor->right;
}
} while (predecessor->down != NULL);
/* XXX DCL probably needs work on the concept */
if (origin != NULL) {
new_origin = true;
}
}
} else if (chain->level_count > 0) {
/*
* Dang, didn't find a predecessor in this level.
* Got to the root of this level without having
* traversed any right links. Ascend the tree one
* level; the node that points to this tree is the
* predecessor.
*/
INSIST(chain->level_count > 0 && current->is_root);
predecessor = chain->levels[--chain->level_count];
/* XXX DCL probably needs work on the concept */
/*
* Don't declare an origin change when the new origin is
* "." at the top level tree, because "." is declared as
* the origin for the second level tree.
2008-01-22 23:28:04 +00:00
*/
if (origin != NULL &&
(chain->level_count > 0 || predecessor->offsetlen > 1))
{
new_origin = true;
}
}
if (predecessor != NULL) {
chain->end = predecessor;
if (new_origin) {
result = dns_rbtnodechain_current(chain, name, origin,
NULL);
if (result == ISC_R_SUCCESS) {
result = DNS_R_NEWORIGIN;
}
} else {
result = dns_rbtnodechain_current(chain, name, NULL,
NULL);
}
} else {
result = ISC_R_NOMORE;
}
return (result);
}
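
/*
 * An ordering example for the descent above (a sketch): with the names
 * "a", "b", "z.b" and "d" in the tree, the level predecessor of "d" is
 * "b"; but "b" has a down tree containing "z.b", so the true
 * DNSSEC-order predecessor of "d" is "z.b".
 */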
isc_result_t
dns_rbtnodechain_down(dns_rbtnodechain_t *chain, dns_name_t *name,
dns_name_t *origin) {
dns_rbtnode_t *current, *successor;
isc_result_t result = ISC_R_SUCCESS;
bool new_origin = false;
REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);
successor = NULL;
current = chain->end;
if (current->down != NULL) {
/*
* Don't declare an origin change when the new origin is
* "." at the second level tree, because "." is already
* declared as the origin for the top level tree.
*/
if (chain->level_count > 0 || current->offsetlen > 1) {
new_origin = true;
}
ADD_LEVEL(chain, current);
current = current->down;
while (current->left != NULL) {
current = current->left;
}
successor = current;
}
if (successor != NULL) {
chain->end = successor;
/*
* It is not necessary to use dns_rbtnodechain_current
* like the other functions because this function will
* never find a node in the topmost level. This is
* because the root level will never be more than one
* name, and everything in the megatree is a successor
* to that node, down at the second level or below.
*/
if (name != NULL) {
node_name(chain->end, name);
}
if (new_origin) {
if (origin != NULL) {
result = chain_name(chain, origin, false);
}
if (result == ISC_R_SUCCESS) {
result = DNS_R_NEWORIGIN;
}
} else {
result = ISC_R_SUCCESS;
}
} else {
result = ISC_R_NOMORE;
}
return (result);
}
isc_result_t
dns_rbtnodechain_nextflat(dns_rbtnodechain_t *chain, dns_name_t *name) {
dns_rbtnode_t *current, *previous, *successor;
isc_result_t result = ISC_R_SUCCESS;
REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);
successor = NULL;
current = chain->end;
if (current->right == NULL) {
while (!current->is_root) {
previous = current;
current = current->parent;
if (current->left == previous) {
successor = current;
break;
}
}
} else {
current = current->right;
while (current->left != NULL) {
current = current->left;
}
successor = current;
}
if (successor != NULL) {
chain->end = successor;
if (name != NULL) {
node_name(chain->end, name);
}
result = ISC_R_SUCCESS;
} else {
result = ISC_R_NOMORE;
}
return (result);
}
isc_result_t
dns_rbtnodechain_next(dns_rbtnodechain_t *chain, dns_name_t *name,
dns_name_t *origin) {
dns_rbtnode_t *current, *previous, *successor;
isc_result_t result = ISC_R_SUCCESS;
bool new_origin = false;
REQUIRE(VALID_CHAIN(chain) && chain->end != NULL);
successor = NULL;
current = chain->end;
/*
* If there is a level below this node, the next node is the
* leftmost node of the next level.
*/
if (current->down != NULL) {
/*
* Don't declare an origin change when the new origin is
* "." at the second level tree, because "." is already
* declared as the origin for the top level tree.
*/
if (chain->level_count > 0 || current->offsetlen > 1) {
new_origin = true;
}
ADD_LEVEL(chain, current);
current = current->down;
while (current->left != NULL) {
current = current->left;
}
successor = current;
} else if (current->right == NULL) {
/*
* The successor is up, either in this level or a
* previous one. Head back toward the root of the tree,
* looking for any path that was via a left link; the
* successor is the node that has that left link. In
* the event the root of the level is reached without
* having traversed any left links, ascend one level and
* look for either a right link off the point of ascent,
* or search for a left link upward again, repeating
2009-01-17 14:45:17 +00:00
* ascends until either case is true.
2008-01-22 23:28:04 +00:00
*/
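
		/*
		 * For example (an illustrative sketch, not taken from
		 * this file), in a level tree
		 *
		 *	      b
		 *	     / \
		 *	    a   d
		 *	       /
		 *	      c
		 *
		 * the in-level successor of 'a' is 'b' and that of 'c'
		 * is 'd' (the first ancestor reached via a left link),
		 * while 'd' reaches the root without crossing a left
		 * link and so forces an ascent to the previous level.
		 */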
		do {
			while (!current->is_root) {
				previous = current;
				current = current->parent;

				if (current->left == previous) {
					successor = current;
					break;
				}
			}

			if (successor == NULL) {
				/*
				 * Reached the root without having
				 * traversed any left pointers, so this
				 * level is done.
				 */
				if (chain->level_count == 0) {
					/*
					 * If the tree we are iterating
					 * over was modified since this
					 * chain was initialized in a
					 * way that caused node splits
					 * to occur, "current" may now
					 * be pointing to a root node
					 * which appears to be at level
					 * 0 but still has a parent.
					 * If that happens, abort.
					 * Otherwise, we are done
					 * looking for a successor, as
					 * we really have reached the
					 * root node on level 0.
					 */
					INSIST(current->parent == NULL);

					break;
				}
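
				/*
				 * Ascend one level; crossing a level
				 * boundary always changes the origin
				 * name, hence new_origin below.
				 */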
				current = chain->levels[--chain->level_count];
				new_origin = true;

				if (current->right != NULL) {
					break;
				}
			}
		} while (successor == NULL);
	}
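
	/*
	 * Standard binary-search-tree successor: if one has not been
	 * found yet, it is the leftmost node of the right subtree,
	 * either of the current node or of the node reached by the
	 * ascent above.
	 */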
	if (successor == NULL && current->right != NULL) {
		current = current->right;

		while (current->left != NULL) {
			current = current->left;
		}

		successor = current;
	}
	if (successor != NULL) {
		/*
		 * If we determine that the current node is the
		 * successor to itself, we will run into an infinite
		 * loop, so abort instead.
		 */
		INSIST(chain->end != successor);

		chain->end = successor;

		/*
		 * It is not necessary to use dns_rbtnodechain_current
		 * like the other functions, because this function will
		 * never find a node in the topmost level.  This is
		 * because the root level will never contain more than
		 * one name, and everything in the megatree is a
		 * successor to that node, down at the second level or
		 * below.
		 */
		if (name != NULL) {
			node_name(chain->end, name);
		}

		if (new_origin) {
			if (origin != NULL) {
				result = chain_name(chain, origin, false);
			}

			if (result == ISC_R_SUCCESS) {
				result = DNS_R_NEWORIGIN;
			}
		} else {
			result = ISC_R_SUCCESS;
		}
	} else {
		result = ISC_R_NOMORE;
	}

	return (result);
}
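
/*
 * Usage sketch (illustrative only, not library code): walking an
 * entire tree with the chain functions below.  'rbt' is assumed to be
 * a valid tree, and the exact dns_rbtnodechain_init() signature has
 * varied across BIND versions, so treat this as a hedged example
 * rather than the canonical pattern.
 *
 *	dns_rbtnodechain_t chain;
 *	dns_fixedname_t fname, forigin;
 *	dns_name_t *name = dns_fixedname_initname(&fname);
 *	dns_name_t *origin = dns_fixedname_initname(&forigin);
 *	isc_result_t result;
 *
 *	dns_rbtnodechain_init(&chain);
 *	for (result = dns_rbtnodechain_first(&chain, rbt, name, origin);
 *	     result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN;
 *	     result = dns_rbtnodechain_next(&chain, name, origin))
 *	{
 *		// DNS_R_NEWORIGIN means 'origin' was rewritten on this
 *		// step; 'name' is always relative to 'origin'.
 *	}
 *	INSIST(result == ISC_R_NOMORE);
 *	dns_rbtnodechain_invalidate(&chain);
 */
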
isc_result_t
dns_rbtnodechain_first(dns_rbtnodechain_t *chain, dns_rbt_t *rbt,
		       dns_name_t *name, dns_name_t *origin)
{
	isc_result_t result;

	REQUIRE(VALID_RBT(rbt));
	REQUIRE(VALID_CHAIN(chain));

	dns_rbtnodechain_reset(chain);

	chain->end = rbt->root;

	result = dns_rbtnodechain_current(chain, name, origin, NULL);

	if (result == ISC_R_SUCCESS) {
		result = DNS_R_NEWORIGIN;
	}

	return (result);
}

isc_result_t
dns_rbtnodechain_last(dns_rbtnodechain_t *chain, dns_rbt_t *rbt,
		      dns_name_t *name, dns_name_t *origin)
{
	isc_result_t result;

	REQUIRE(VALID_RBT(rbt));
	REQUIRE(VALID_CHAIN(chain));

	dns_rbtnodechain_reset(chain);

	result = move_chain_to_last(chain, rbt->root);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	result = dns_rbtnodechain_current(chain, name, origin, NULL);

	if (result == ISC_R_SUCCESS) {
		result = DNS_R_NEWORIGIN;
	}

	return (result);
}
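
/*
 * A reverse traversal (illustrative; assumes dns_rbtnodechain_prev()
 * as declared in <dns/rbt.h>) is symmetric: seed the chain with
 * dns_rbtnodechain_last() and then call dns_rbtnodechain_prev() until
 * it returns ISC_R_NOMORE.
 */
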
void
dns_rbtnodechain_reset(dns_rbtnodechain_t *chain) {
	REQUIRE(VALID_CHAIN(chain));

	/*
	 * Free any dynamic storage associated with 'chain', and then
	 * reinitialize 'chain'.
	 */
	chain->end = NULL;
	chain->level_count = 0;
	chain->level_matches = 0;
}
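
/*
 * Note that dns_rbtnodechain_reset() leaves chain->magic set, so the
 * chain remains valid for reuse; dns_rbtnodechain_invalidate() below
 * additionally clears the magic number, after which the chain must be
 * reinitialized before it can be used again.
 */
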
void
dns_rbtnodechain_invalidate(dns_rbtnodechain_t *chain) {
	/*
	 * Free any dynamic storage associated with 'chain', and then
	 * invalidate 'chain'.
	 */
	dns_rbtnodechain_reset(chain);

	chain->magic = 0;
}

/* XXXMUKS:
 *
 * - worth removing "inline", as static functions are inlined
 *   automatically where suitable by modern compilers.
 * - bump the size of dns_rbt.nodecount to size_t.
 * - the dumpfile header also contains a nodecount that is an
 *   unsigned int.  If large files (> 2^32 nodes) are to be
 *   supported, the allocation for this field should be increased.
 */