diff --git a/CHANGES b/CHANGES index 548208b3ef..2df84ce57a 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,5 @@ -6319. [placeholder] +6319. [func] Limit isc_async_run() overhead for RBTDB tree pruning. + [GL #4383] 6318. [placeholder] diff --git a/lib/dns/include/dns/rbt.h b/lib/dns/include/dns/rbt.h index 93e429069b..ca974e5873 100644 --- a/lib/dns/include/dns/rbt.h +++ b/lib/dns/include/dns/rbt.h @@ -118,6 +118,12 @@ struct dns_rbtnode { */ ISC_LINK(dns_rbtnode_t) deadlink; + /*% + * This linked list is used to store nodes from which tree pruning can + * be started. + */ + ISC_LINK(dns_rbtnode_t) prunelink; + /*@{*/ /*! * These values are used in the RBT DB implementation. The appropriate diff --git a/lib/dns/rbt.c b/lib/dns/rbt.c index 54ddfd9175..34ec175bc3 100644 --- a/lib/dns/rbt.c +++ b/lib/dns/rbt.c @@ -1520,6 +1520,7 @@ create_node(isc_mem_t *mctx, const dns_name_t *name, dns_rbtnode_t **nodep) { }; ISC_LINK_INIT(node, deadlink); + ISC_LINK_INIT(node, prunelink); isc_refcount_init(&node->references, 0); diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index 900b580058..3761cef590 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -439,6 +439,7 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) { unsigned int i; isc_result_t result; char buf[DNS_NAME_FORMATSIZE]; + dns_rbtnode_t *node = NULL; dns_rbt_t **treep = NULL; isc_time_t start; @@ -461,8 +462,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) { * the overhead of unlinking all nodes here should be negligible. */ for (i = 0; i < rbtdb->node_lock_count; i++) { - dns_rbtnode_t *node = NULL; - node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); while (node != NULL) { ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink); @@ -470,6 +469,12 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log) { } } + node = ISC_LIST_HEAD(rbtdb->prunenodes); + while (node != NULL) { + ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink); + node = ISC_LIST_HEAD(rbtdb->prunenodes); + } + rbtdb->quantum = (rbtdb->loop != NULL) ? 
100 : 0; for (;;) { @@ -1148,16 +1153,26 @@ is_leaf(dns_rbtnode_t *node) { node->left == NULL && node->right == NULL); } +/*% + * The tree lock must be held when this function is called as it reads and + * updates rbtdb->prunenodes. + */ static void send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, isc_rwlocktype_t locktype DNS__DB_FLARG) { - prune_t *prune = isc_mem_get(rbtdb->common.mctx, sizeof(*prune)); - *prune = (prune_t){ .node = node }; + bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL); + + INSIST(locktype == isc_rwlocktype_write); - dns_db_attach((dns_db_t *)rbtdb, &prune->db); dns__rbtdb_newref(rbtdb, node, locktype DNS__DB_FLARG_PASS); + INSIST(!ISC_LINK_LINKED(node, prunelink)); + ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink); - isc_async_run(rbtdb->loop, prune_tree, prune); + if (!pruning_queued) { + dns_db_t *db = NULL; + dns_db_attach((dns_db_t *)rbtdb, &db); + isc_async_run(rbtdb->loop, prune_tree, db); + } } /*% @@ -1455,64 +1470,83 @@ restore_locks: } /* - * Prune the tree by recursively cleaning-up single leaves. In the worst - * case, the number of iteration is the number of tree levels, which is at - * most the maximum number of domain name labels, i.e, 127. In practice, this - * should be much smaller (only a few times), and even the worst case would be - * acceptable for a single event. + * Prune the tree by recursively cleaning up single leaves. Go through all + * nodes stored in the rbtdb->prunenodes list; for each of them, in the worst + * case, it will be necessary to traverse a number of tree levels equal to the + * maximum legal number of domain name labels (127); in practice, the number of + * tree levels to traverse will virtually always be much smaller (a few levels + * at most). 
While holding the tree lock throughout this entire operation is + * less than ideal, so is splitting the latter up by queueing a separate + * prune_tree() run for each node to start pruning from (as queueing requires + * allocating memory and can therefore potentially be exploited to exhaust + * available memory). Also note that actually freeing up the memory used by + * RBTDB nodes (which is what this function does) is essential to keeping cache + * memory use in check, so since the tree lock needs to be acquired anyway, + * freeing as many nodes as possible before the tree lock gets released is + * prudent. */ static void prune_tree(void *arg) { - prune_t *prune = (prune_t *)arg; - dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)prune->db; - dns_rbtnode_t *node = prune->node; + dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)arg; + dns_rbtnode_t *node = NULL; dns_rbtnode_t *parent = NULL; unsigned int locknum; isc_rwlocktype_t tlocktype = isc_rwlocktype_none; isc_rwlocktype_t nlocktype = isc_rwlocktype_none; - isc_mem_put(rbtdb->common.mctx, prune, sizeof(*prune)); - TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype); - locknum = node->locknum; - NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype); - do { - parent = node->parent; - dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true, - true DNS__DB_FILELINE); - if (parent != NULL && parent->down == NULL) { - /* - * node was the only down child of the parent and has - * just been removed. We'll then need to examine the - * parent. Keep the lock if possible; otherwise, - * release the old lock and acquire one for the parent. 
- */ - if (parent->locknum != locknum) { - NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, - &nlocktype); - locknum = parent->locknum; - NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, - &nlocktype); + while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) { + locknum = node->locknum; + NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype); + do { + if (ISC_LINK_LINKED(node, prunelink)) { + ISC_LIST_UNLINK(rbtdb->prunenodes, node, + prunelink); } - /* - * We need to gain a reference to the node before - * decrementing it in the next iteration. - */ - if (ISC_LINK_LINKED(parent, deadlink)) { - ISC_LIST_UNLINK(rbtdb->deadnodes[locknum], + parent = node->parent; + dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, + &tlocktype, true, + true DNS__DB_FILELINE); + + if (parent != NULL && parent->down == NULL) { + /* + * node was the only down child of the parent + * and has just been removed. We'll then need + * to examine the parent. Keep the lock if + * possible; otherwise, release the old lock and + * acquire one for the parent. + */ + if (parent->locknum != locknum) { + NODE_UNLOCK( + &rbtdb->node_locks[locknum].lock, + &nlocktype); + locknum = parent->locknum; + NODE_WRLOCK( + &rbtdb->node_locks[locknum].lock, + &nlocktype); + } + + /* + * We need to gain a reference to the node + * before decrementing it in the next iteration. 
+ */ + if (ISC_LINK_LINKED(parent, deadlink)) { + ISC_LIST_UNLINK( + rbtdb->deadnodes[locknum], parent, deadlink); + } + dns__rbtdb_newref(rbtdb, parent, + nlocktype DNS__DB_FILELINE); + } else { + parent = NULL; } - dns__rbtdb_newref(rbtdb, parent, - nlocktype DNS__DB_FILELINE); - } else { - parent = NULL; - } - node = parent; - } while (node != NULL); - NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype); + node = parent; + } while (node != NULL); + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype); + } TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype); dns_db_detach((dns_db_t **)&rbtdb); @@ -3858,6 +3892,8 @@ dns__rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type, ISC_LIST_INIT(rbtdb->deadnodes[i]); } + ISC_LIST_INIT(rbtdb->prunenodes); + rbtdb->active = rbtdb->node_lock_count; for (i = 0; i < (int)(rbtdb->node_lock_count); i++) { diff --git a/lib/dns/rbtdb_p.h b/lib/dns/rbtdb_p.h index 1aa5672533..84ab5ec4d8 100644 --- a/lib/dns/rbtdb_p.h +++ b/lib/dns/rbtdb_p.h @@ -296,6 +296,10 @@ struct dns_rbtdb { */ dns_rbtnodelist_t *deadnodes; + /* List of nodes from which recursive tree pruning can be started. + * Locked by tree_lock. */ + dns_rbtnodelist_t prunenodes; + /* * Heaps. These are used for TTL based expiry in a cache, * or for zone resigning in a zone DB. hmctx is the memory @@ -342,14 +346,6 @@ typedef struct { isc_stdtime_t now; } rbtdb_load_t; -/*% - * Prune context - */ -typedef struct { - dns_db_t *db; - dns_rbtnode_t *node; -} prune_t; - extern dns_dbmethods_t dns__rbtdb_zonemethods; extern dns_dbmethods_t dns__rbtdb_cachemethods;