/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
/*! \file */
#include <inttypes.h>
#include <stdbool.h>
#include <isc/buffer.h>
#include <isc/hash.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/mutex.h>
#include <isc/print.h>
#include <isc/rwlock.h>
#include <isc/string.h>
#include <isc/time.h>
#include <isc/util.h>
#include <dns/badcache.h>
#include <dns/fixedname.h>
#include <dns/name.h>
#include <dns/rdatatype.h>
#include <dns/types.h>
typedef struct dns_bcentry dns_bcentry_t;
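/*
 * The bad cache is a fixed-array hash table of singly-linked entry
 * chains.  The table geometry (table, tlocks, size) is protected by
 * the reader-writer 'lock'; each chain is protected by the per-bucket
 * mutex in 'tlocks', so lookups and insertions in different buckets
 * can proceed concurrently.  'count' and 'sweep' are atomics so they
 * can be updated while only a read lock on the table is held.
 */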
struct dns_badcache {
unsigned int magic;
isc_rwlock_t lock;
isc_mem_t *mctx;
isc_mutex_t *tlocks;
dns_bcentry_t **table;
atomic_uint_fast32_t count;
atomic_uint_fast32_t sweep;
unsigned int minsize;
unsigned int size;
};
#define BADCACHE_MAGIC ISC_MAGIC('B', 'd', 'C', 'a')
#define VALID_BADCACHE(m) ISC_MAGIC_VALID(m, BADCACHE_MAGIC)
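/*
 * A single bad cache entry: one node in a bucket's singly-linked
 * chain.  The full hash value is kept in 'hashval' so that
 * badcache_resize() can relink entries into a new table without
 * recomputing dns_name_hash(); the owner name is stored in the
 * embedded fixed name, with 'name' pointing into it.
 */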
struct dns_bcentry {
dns_bcentry_t *next;
dns_rdatatype_t type;
isc_time_t expire;
uint32_t flags;
unsigned int hashval;
dns_fixedname_t fname;
dns_name_t *name;
};
static void
badcache_resize(dns_badcache_t *bc, isc_time_t *now);
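/*
 * Typical usage (editor's sketch, not part of the original file;
 * assumes an existing memory context 'mctx' and query name 'qname',
 * and uses the standard isc_interval_set()/isc_time_add() helpers to
 * build the expiry time):
 *
 *	dns_badcache_t *bc = NULL;
 *	isc_time_t now, expire;
 *	isc_interval_t interval;
 *	uint32_t flags = 0;
 *
 *	RUNTIME_CHECK(dns_badcache_init(mctx, 1021, &bc) == ISC_R_SUCCESS);
 *	RUNTIME_CHECK(isc_time_now(&now) == ISC_R_SUCCESS);
 *	isc_interval_set(&interval, 600, 0);
 *	RUNTIME_CHECK(isc_time_add(&now, &interval, &expire) == ISC_R_SUCCESS);
 *
 *	dns_badcache_add(bc, qname, dns_rdatatype_aaaa, false, 0, &expire);
 *	if (dns_badcache_find(bc, qname, dns_rdatatype_aaaa, &flags, &now)) {
 *		... the name/type pair is currently cached as bad ...
 *	}
 *	dns_badcache_destroy(&bc);
 */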
isc_result_t
dns_badcache_init(isc_mem_t *mctx, unsigned int size, dns_badcache_t **bcp) {
dns_badcache_t *bc = NULL;
unsigned int i;
REQUIRE(bcp != NULL && *bcp == NULL);
REQUIRE(mctx != NULL);
bc = isc_mem_get(mctx, sizeof(dns_badcache_t));
memset(bc, 0, sizeof(dns_badcache_t));
isc_mem_attach(mctx, &bc->mctx);
isc_rwlock_init(&bc->lock, 0, 0);
bc->table = isc_mem_get(bc->mctx, sizeof(*bc->table) * size);
bc->tlocks = isc_mem_get(bc->mctx, sizeof(isc_mutex_t) * size);
for (i = 0; i < size; i++) {
isc_mutex_init(&bc->tlocks[i]);
}
bc->size = bc->minsize = size;
memset(bc->table, 0, bc->size * sizeof(dns_bcentry_t *));
atomic_init(&bc->count, 0);
atomic_init(&bc->sweep, 0);
bc->magic = BADCACHE_MAGIC;
*bcp = bc;
return (ISC_R_SUCCESS);
}
void
dns_badcache_destroy(dns_badcache_t **bcp) {
dns_badcache_t *bc;
unsigned int i;
REQUIRE(bcp != NULL && *bcp != NULL);
bc = *bcp;
*bcp = NULL;
dns_badcache_flush(bc);
bc->magic = 0;
isc_rwlock_destroy(&bc->lock);
for (i = 0; i < bc->size; i++) {
isc_mutex_destroy(&bc->tlocks[i]);
}
isc_mem_put(bc->mctx, bc->table, sizeof(dns_bcentry_t *) * bc->size);
isc_mem_put(bc->mctx, bc->tlocks, sizeof(isc_mutex_t) * bc->size);
isc_mem_putanddetach(&bc->mctx, bc, sizeof(dns_badcache_t));
}
static void
badcache_resize(dns_badcache_t *bc, isc_time_t *now) {
dns_bcentry_t **newtable, *bad, *next;
isc_mutex_t *newlocks;
unsigned int newsize, i;
bool grow;
RWLOCK(&bc->lock, isc_rwlocktype_write);
/*
* XXXWPK we will have a thundering herd problem here,
* as all threads will wait on the RWLOCK when there's
* a need to resize badcache.
* However, it happens so rarely it should not be a
* performance issue. This is because we double the
* size every time we grow it, and we don't shrink
* unless the number of entries has really shrunk. In a
* high load situation, the number of badcache entries
* will eventually stabilize.
*/
if (atomic_load_relaxed(&bc->count) > bc->size * 8) {
grow = true;
} else if (atomic_load_relaxed(&bc->count) < bc->size * 2 &&
bc->size > bc->minsize)
{
grow = false;
} else {
/* Someone resized it already, bail. */
RWUNLOCK(&bc->lock, isc_rwlocktype_write);
return;
}
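	/*
	 * Growing to (2n + 1) and shrinking to ((n - 1) / 2) are exact
	 * inverses of each other, so a table that grows and later
	 * shrinks returns to sizes it has already used.
	 */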
if (grow) {
newsize = bc->size * 2 + 1;
} else {
newsize = (bc->size - 1) / 2;
#ifdef __clang_analyzer__
/*
* XXXWPK there's a bug in clang static analyzer -
* `value % newsize` is considered undefined even though
* we check if newsize is larger than 0. This helps.
*/
newsize += 1;
#endif
}
RUNTIME_CHECK(newsize > 0);
newtable = isc_mem_get(bc->mctx, sizeof(dns_bcentry_t *) * newsize);
memset(newtable, 0, sizeof(dns_bcentry_t *) * newsize);
newlocks = isc_mem_get(bc->mctx, sizeof(isc_mutex_t) * newsize);
/* Copy existing mutexes */
for (i = 0; i < newsize && i < bc->size; i++) {
newlocks[i] = bc->tlocks[i];
}
/* Initialize additional mutexes if we're growing */
for (i = bc->size; i < newsize; i++) {
isc_mutex_init(&newlocks[i]);
}
/* Destroy extra mutexes if we're shrinking */
for (i = newsize; i < bc->size; i++) {
isc_mutex_destroy(&bc->tlocks[i]);
}
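	/*
	 * Relink every surviving entry into the new table using its
	 * stored hash value; entries that have already expired are
	 * freed instead of being carried over.
	 */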
for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
for (bad = bc->table[i]; bad != NULL; bad = next) {
next = bad->next;
if (isc_time_compare(&bad->expire, now) < 0) {
isc_mem_put(bc->mctx, bad, sizeof(*bad));
atomic_fetch_sub_relaxed(&bc->count, 1);
} else {
bad->next = newtable[bad->hashval % newsize];
newtable[bad->hashval % newsize] = bad;
}
}
bc->table[i] = NULL;
}
isc_mem_put(bc->mctx, bc->tlocks, sizeof(isc_mutex_t) * bc->size);
bc->tlocks = newlocks;
isc_mem_put(bc->mctx, bc->table, sizeof(*bc->table) * bc->size);
bc->size = newsize;
bc->table = newtable;
RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}
void
dns_badcache_add(dns_badcache_t *bc, const dns_name_t *name,
dns_rdatatype_t type, bool update, uint32_t flags,
isc_time_t *expire) {
isc_result_t result;
unsigned int hashval, hash;
dns_bcentry_t *bad, *prev, *next;
isc_time_t now;
bool resize = false;
REQUIRE(VALID_BADCACHE(bc));
REQUIRE(name != NULL);
REQUIRE(expire != NULL);
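	/*
	 * The table lock is only taken for reading: it keeps the table
	 * geometry stable while the per-bucket mutex serializes changes
	 * to this chain.  Any resize is deferred until both locks have
	 * been released (see the end of this function).
	 */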
RWLOCK(&bc->lock, isc_rwlocktype_read);
result = isc_time_now(&now);
if (result != ISC_R_SUCCESS) {
isc_time_settoepoch(&now);
}
hashval = dns_name_hash(name, false);
hash = hashval % bc->size;
LOCK(&bc->tlocks[hash]);
prev = NULL;
for (bad = bc->table[hash]; bad != NULL; bad = next) {
next = bad->next;
if (bad->type == type && dns_name_equal(name, bad->name)) {
if (update) {
bad->expire = *expire;
bad->flags = flags;
}
break;
}
if (isc_time_compare(&bad->expire, &now) < 0) {
if (prev == NULL) {
bc->table[hash] = bad->next;
} else {
prev->next = bad->next;
}
isc_mem_put(bc->mctx, bad, sizeof(*bad));
atomic_fetch_sub_relaxed(&bc->count, 1);
} else {
prev = bad;
}
}
	if (bad == NULL) {
		unsigned count;
		bad = isc_mem_get(bc->mctx, sizeof(*bad));
		*bad = (dns_bcentry_t){ .type = type,
					.hashval = hashval,
					.expire = *expire,
					.flags = flags,
					.next = bc->table[hash] };
		bad->name = dns_fixedname_initname(&bad->fname);
		dns_name_copy(name, bad->name);
bc->table[hash] = bad;
count = atomic_fetch_add_relaxed(&bc->count, 1);
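		/*
		 * These thresholds mirror the ones in badcache_resize();
		 * the actual resize is only attempted once the bucket
		 * and table locks have been dropped below.
		 */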
if ((count > bc->size * 8) ||
(count < bc->size * 2 && bc->size > bc->minsize)) {
resize = true;
}
} else {
bad->expire = *expire;
}
UNLOCK(&bc->tlocks[hash]);
RWUNLOCK(&bc->lock, isc_rwlocktype_read);
if (resize) {
badcache_resize(bc, &now);
}
}
bool
dns_badcache_find(dns_badcache_t *bc, const dns_name_t *name,
dns_rdatatype_t type, uint32_t *flagp, isc_time_t *now) {
dns_bcentry_t *bad, *prev, *next;
bool answer = false;
unsigned int i;
unsigned int hash;
REQUIRE(VALID_BADCACHE(bc));
REQUIRE(name != NULL);
REQUIRE(now != NULL);
RWLOCK(&bc->lock, isc_rwlocktype_read);
/*
* XXXMUKS: dns_name_equal() is expensive as it does an
* octet-by-octet comparison, and it can be made better in two
* ways here. First, lowercase the names (use
* dns_name_downcase() instead of dns_name_copy() in
* dns_badcache_add()) so that dns_name_caseequal() can be used
* which the compiler will emit as SIMD instructions. Second,
* don't put multiple copies of the same name in the chain (or
* multiple names will have to be matched for equality), but use
* name->link to store the type specific part.
*/
if (atomic_load_relaxed(&bc->count) == 0) {
goto skip;
}
hash = dns_name_hash(name, false) % bc->size;
prev = NULL;
LOCK(&bc->tlocks[hash]);
for (bad = bc->table[hash]; bad != NULL; bad = next) {
next = bad->next;
/*
* Search the hash list. Clean out expired records as we go.
*/
if (isc_time_compare(&bad->expire, now) < 0) {
if (prev != NULL) {
prev->next = bad->next;
} else {
bc->table[hash] = bad->next;
}
isc_mem_put(bc->mctx, bad, sizeof(*bad));
atomic_fetch_sub(&bc->count, 1);
continue;
}
if (bad->type == type && dns_name_equal(name, bad->name)) {
if (flagp != NULL) {
*flagp = bad->flags;
}
answer = true;
break;
}
prev = bad;
}
UNLOCK(&bc->tlocks[hash]);
skip:
	/*
	 * Slow sweep to clean out stale records: each call advances the
	 * sweep cursor by one bucket and, if that bucket's lock can be
	 * taken without blocking, frees the entry at the head of the
	 * chain when it has expired.  Over many lookups this gradually
	 * reclaims buckets that regular searches never visit.
	 */
i = atomic_fetch_add(&bc->sweep, 1) % bc->size;
if (isc_mutex_trylock(&bc->tlocks[i]) == ISC_R_SUCCESS) {
bad = bc->table[i];
if (bad != NULL && isc_time_compare(&bad->expire, now) < 0) {
bc->table[i] = bad->next;
isc_mem_put(bc->mctx, bad, sizeof(*bad));
atomic_fetch_sub_relaxed(&bc->count, 1);
}
UNLOCK(&bc->tlocks[i]);
}
RWUNLOCK(&bc->lock, isc_rwlocktype_read);
return (answer);
}
void
dns_badcache_flush(dns_badcache_t *bc) {
dns_bcentry_t *entry, *next;
unsigned int i;
RWLOCK(&bc->lock, isc_rwlocktype_write);
REQUIRE(VALID_BADCACHE(bc));
for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
for (entry = bc->table[i]; entry != NULL; entry = next) {
next = entry->next;
isc_mem_put(bc->mctx, entry, sizeof(*entry));
atomic_fetch_sub_relaxed(&bc->count, 1);
}
bc->table[i] = NULL;
}
RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}
void
dns_badcache_flushname(dns_badcache_t *bc, const dns_name_t *name) {
dns_bcentry_t *bad, *prev, *next;
isc_result_t result;
isc_time_t now;
unsigned int hash;
REQUIRE(VALID_BADCACHE(bc));
REQUIRE(name != NULL);
RWLOCK(&bc->lock, isc_rwlocktype_read);
result = isc_time_now(&now);
if (result != ISC_R_SUCCESS) {
isc_time_settoepoch(&now);
}
hash = dns_name_hash(name, false) % bc->size;
LOCK(&bc->tlocks[hash]);
prev = NULL;
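	/*
	 * Remove every entry for this exact name, dropping any expired
	 * entries encountered along the way.
	 */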
for (bad = bc->table[hash]; bad != NULL; bad = next) {
int n;
next = bad->next;
n = isc_time_compare(&bad->expire, &now);
if (n < 0 || dns_name_equal(name, bad->name)) {
if (prev == NULL) {
bc->table[hash] = bad->next;
} else {
prev->next = bad->next;
}
isc_mem_put(bc->mctx, bad, sizeof(*bad));
atomic_fetch_sub_relaxed(&bc->count, 1);
} else {
prev = bad;
}
}
UNLOCK(&bc->tlocks[hash]);
RWUNLOCK(&bc->lock, isc_rwlocktype_read);
}
void
dns_badcache_flushtree(dns_badcache_t *bc, const dns_name_t *name) {
dns_bcentry_t *bad, *prev, *next;
unsigned int i;
int n;
isc_time_t now;
isc_result_t result;
REQUIRE(VALID_BADCACHE(bc));
REQUIRE(name != NULL);
/*
* We write lock the tree to avoid relocking every node
* individually.
*/
RWLOCK(&bc->lock, isc_rwlocktype_write);
result = isc_time_now(&now);
if (result != ISC_R_SUCCESS) {
isc_time_settoepoch(&now);
}
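	/*
	 * Walk every bucket, removing entries that have expired as well
	 * as any entry whose name is at or below 'name'.
	 */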
for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
prev = NULL;
for (bad = bc->table[i]; bad != NULL; bad = next) {
next = bad->next;
n = isc_time_compare(&bad->expire, &now);
if (n < 0 || dns_name_issubdomain(bad->name, name)) {
if (prev == NULL) {
bc->table[i] = bad->next;
} else {
prev->next = bad->next;
}
isc_mem_put(bc->mctx, bad, sizeof(*bad));
atomic_fetch_sub_relaxed(&bc->count, 1);
} else {
prev = bad;
}
}
}
RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}
void
dns_badcache_print(dns_badcache_t *bc, const char *cachename, FILE *fp) {
char namebuf[DNS_NAME_FORMATSIZE];
char typebuf[DNS_RDATATYPE_FORMATSIZE];
dns_bcentry_t *bad, *next, *prev;
isc_time_t now;
unsigned int i;
uint64_t t;
REQUIRE(VALID_BADCACHE(bc));
REQUIRE(cachename != NULL);
REQUIRE(fp != NULL);
/*
* We write lock the tree to avoid relocking every node
* individually.
*/
RWLOCK(&bc->lock, isc_rwlocktype_write);
fprintf(fp, ";\n; %s\n;\n", cachename);
TIME_NOW(&now);
for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
prev = NULL;
for (bad = bc->table[i]; bad != NULL; bad = next) {
next = bad->next;
if (isc_time_compare(&bad->expire, &now) < 0) {
if (prev != NULL) {
prev->next = bad->next;
} else {
bc->table[i] = bad->next;
}
isc_mem_put(bc->mctx, bad, sizeof(*bad));
atomic_fetch_sub_relaxed(&bc->count, 1);
continue;
}
prev = bad;
dns_name_format(bad->name, namebuf, sizeof(namebuf));
dns_rdatatype_format(bad->type, typebuf,
sizeof(typebuf));
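			/*
			 * isc_time_microdiff() returns microseconds;
			 * report the remaining lifetime in milliseconds.
			 */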
t = isc_time_microdiff(&bad->expire, &now);
t /= 1000;
fprintf(fp,
"; %s/%s [ttl "
"%" PRIu64 "]\n",
namebuf, typebuf, t);
}
}
RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}