diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index 76c69afdfb..d567c99d80 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -1184,7 +1184,7 @@ dns__rbtdb_newref(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, * The tree lock must be held for the result to be valid. */ static bool -is_leaf(dns_rbtnode_t *node) { +is_last_node_on_its_level(dns_rbtnode_t *node) { return (node->parent != NULL && node->parent->down == node && node->left == NULL && node->right == NULL); } @@ -1231,7 +1231,7 @@ cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum DNS__DB_FLARG) { continue; } - if (is_leaf(node) && rbtdb->loop != NULL) { + if (is_last_node_on_its_level(node) && rbtdb->loop != NULL) { send_to_prune_tree( rbtdb, node, isc_rwlocktype_write DNS__DB_FLARG_PASS); @@ -1449,29 +1449,32 @@ dns__rbtdb_decref(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, if (write_locked) { /* - * If this node is the only one in the level it's in, deleting - * this node may recursively make its parent the only node in - * the parent level; if so, and if no one is currently using - * the parent node, this is almost the only opportunity to - * clean it up. But the recursive cleanup is not that trivial - * since the child and parent may be in different lock buckets, - * which would cause a lock order reversal problem. To avoid - * the trouble, we'll dispatch a separate event for batch - * cleaning. We need to check whether we're deleting the node - * as a result of pruning to avoid infinite dispatching. - * Note: pruning happens only when a loop has been set for the - * rbtdb. If the user of the rbtdb chooses not to set a loop, - * it's their responsibility to purge stale leaves (e.g. by - * periodic walk-through). + * If this node is the only one left on its RBTDB level, + * attempt pruning the RBTDB (i.e. deleting empty nodes that + * are ancestors of 'node' and are not interior nodes) starting + * from this node (see prune_tree()). The main reason this is + * not done immediately, but asynchronously, is that the + * ancestors of 'node' are almost guaranteed to belong to + * different node buckets and we don't want to do juggle locks + * right now. + * + * Since prune_tree() also calls dns__rbtdb_decref(), check the + * value of the 'pruning' parameter (which is only set to + * 'true' in the dns__rbtdb_decref() call present in + * prune_tree()) to prevent an infinite loop and to allow a + * node sent to prune_tree() to be deleted by the delete_node() + * call in the code branch below. */ - - if (!pruning && is_leaf(node) && rbtdb->loop != NULL) { + if (!pruning && is_last_node_on_its_level(node) && + rbtdb->loop != NULL) + { send_to_prune_tree(rbtdb, node, *nlocktypep DNS__DB_FLARG_PASS); no_reference = false; } else { - /* We can now delete the node. */ - + /* + * The node can now be deleted. + */ delete_node(rbtdb, node); } } else { @@ -1494,10 +1497,27 @@ restore_locks: } /* - * Prune the tree by cleaning up single leaves. A single execution of this - * function cleans up a single node; if the parent of the latter becomes a - * single leaf on its own level as a result, the parent is then also sent to - * this function. + * Prune the RBTDB tree of trees. Start by attempting to delete a node that is + * the only one left on its RBTDB level (see the send_to_prune_tree() call in + * dns__rbtdb_decref()). Then, if the node has a parent (which can either + * exist on the same RBTDB level or on an upper RBTDB level), check whether the + * latter is an interior node (i.e. a node with a non-NULL 'down' pointer). If + * the parent node is not an interior node, attempt deleting the parent node as + * well and then move on to examining the parent node's parent, etc. Continue + * traversing the RBTDB tree until a node is encountered that is still an + * interior node after the previously-processed node gets deleted. + * + * It is acceptable for a node sent to this function to NOT be deleted in the + * process (e.g. if it gets reactivated in the meantime). Furthermore, node + * deletion is not a prerequisite for continuing RBTDB traversal. + * + * This function gets called once for every "starting node" and it continues + * traversing the RBTDB until the stop condition is met. In the worst case, + * the number of nodes processed by a single execution of this function is the + * number of tree levels, which is at most the maximum number of domain name + * labels (127); however, it should be much smaller in practice and deleting + * empty RBTDB nodes is critical to keeping the amount of memory used by the + * cache memory context within the configured limit anyway. */ static void prune_tree(void *arg) { @@ -1512,22 +1532,45 @@ prune_tree(void *arg) { isc_mem_put(rbtdb->common.mctx, prune, sizeof(*prune)); TREE_WRLOCK(&rbtdb->tree_lock, &tlocktype); - - parent = node->parent; - NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype); - dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true, - true DNS__DB_FILELINE); + do { + parent = node->parent; + dns__rbtdb_decref(rbtdb, node, 0, &nlocktype, &tlocktype, true, + true DNS__DB_FILELINE); + + /* + * Check whether the parent is an interior node. Note that it + * might have been one before the dns__rbtdb_decref() call on + * the previous line, but decrementing the reference count for + * 'node' could have caused 'node->parent->down' to become + * NULL. + */ + if (parent != NULL && parent->down == NULL) { + /* + * Keep the node lock if possible; otherwise, release + * the old lock and acquire one for the parent. + */ + if (parent->locknum != locknum) { + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + &nlocktype); + locknum = parent->locknum; + NODE_WRLOCK(&rbtdb->node_locks[locknum].lock, + &nlocktype); + } + + /* + * We need to gain a reference to the parent node + * before decrementing it in the next iteration. + */ + dns__rbtdb_newref(rbtdb, parent, + nlocktype DNS__DB_FLARG_PASS); + } else { + parent = NULL; + } + + node = parent; + } while (node != NULL); NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype); - - if (parent != NULL && is_leaf(parent)) { - NODE_WRLOCK(&rbtdb->node_locks[parent->locknum].lock, - &nlocktype); - send_to_prune_tree(rbtdb, parent, nlocktype); - NODE_UNLOCK(&rbtdb->node_locks[parent->locknum].lock, - &nlocktype); - } - TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype); dns_db_detach((dns_db_t **)&rbtdb);