From 1233dc8a61c228775cafdc8565cb2f180cf1b428 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?=
Date: Sun, 23 Feb 2025 14:36:35 +0100
Subject: [PATCH] Add isc_sieve unit implementing SIEVE-LRU algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the core implementation of the SIEVE algorithm described in
the following paper:

Zhang, Yazhuo, Juncheng Yang, Yao Yue, Ymir Vigfusson, and K. V. Rashmi.
"SIEVE Is Simpler than LRU: An Efficient Turn-Key Eviction Algorithm for
Web Caches," n.d. Available online from
https://junchengyang.com/publication/nsdi24-SIEVE.pdf
---
 lib/dns/include/dns/rdataslab.h |  18 +-
 lib/dns/qpcache.c               | 513 ++++++++------------------------
 lib/dns/rdataslab.c             |   1 +
 lib/isc/Makefile.am             |   1 +
 lib/isc/include/isc/sieve.h     | 166 +++++++++++
 tests/dns/qpdb_test.c           |  12 +
 6 files changed, 319 insertions(+), 392 deletions(-)
 create mode 100644 lib/isc/include/isc/sieve.h

diff --git a/lib/dns/include/dns/rdataslab.h b/lib/dns/include/dns/rdataslab.h
index 7f393b1e56..5f4c2cfba6 100644
--- a/lib/dns/include/dns/rdataslab.h
+++ b/lib/dns/include/dns/rdataslab.h
@@ -85,14 +85,13 @@ struct dns_slabheader {
	 * when the "cyclic" rrset-order is required.
	 */

-	unsigned int resign_lsb : 1;
+	/* resigning (zone) and TTL-cleaning (cache) */
+	uint16_t resign_lsb : 1;
	isc_stdtime_t resign;
+	isc_heap_t *heap;
	unsigned int heap_index;

-	/*%<
-	 * Used for TTL-based cache cleaning.
-	 */
-	isc_stdtime_t last_used;
+	/* Used for stale refresh */
	_Atomic(uint32_t) last_refresh_fail_ts;

	dns_slabheader_proof_t *noqname;
@@ -127,7 +126,12 @@ struct dns_slabheader {
	 * this rdataset, if any.
	 */

+	dns_gluelist_t *gluelist;
+
+	/*% Used for SIEVE-LRU (cache) and changed_list (zone) */
	ISC_LINK(struct dns_slabheader) link;
+	/*% Used for SIEVE-LRU */
+	bool visited;

	/*%
	 * Case vector.  If the bit is set then the corresponding
@@ -135,10 +139,6 @@ struct dns_slabheader {
	 * rendering that character upper case.
	 */
	unsigned char upper[32];
-
-	isc_heap_t *heap;
-
-	dns_gluelist_t *gluelist;
 };

 enum {
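For orientation (an editorial sketch, not part of the patch): the only
per-entry state the SIEVE queue needs is exactly what dns_slabheader now
carries, an intrusive list link plus a one-bit visited flag. A minimal
participating type would look like this; "myentry_t" is hypothetical,
not a BIND type:

#include <stdbool.h>

#include <isc/list.h>

typedef struct myentry {
	bool visited;		       /* the SIEVE "visited" bit */
	ISC_LINK(struct myentry) link; /* position in the FIFO queue */
	/* ... cached payload ... */
} myentry_t;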
diff --git a/lib/dns/qpcache.c b/lib/dns/qpcache.c
index 1391945c02..1b50608203 100644
--- a/lib/dns/qpcache.c
+++ b/lib/dns/qpcache.c
@@ -16,7 +16,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
@@ -34,6 +33,7 @@
 #include
 #include
 #include
+#include <isc/sieve.h>
 #include
 #include
 #include
 #include
@@ -127,20 +127,6 @@
  */
 #define QPDB_VIRTUAL 300

-/*%
- * Whether to rate-limit updating the LRU to avoid possible thread contention.
- * Updating LRU requires write locking, so we don't do it every time the
- * record is touched - only after some time passes.
- */
-#ifndef DNS_QPDB_LIMITLRUUPDATE
-#define DNS_QPDB_LIMITLRUUPDATE 1
-#endif
-
-/*% Time after which we update LRU for glue records, 5 minutes */
-#define DNS_QPDB_LRUUPDATE_GLUE 300
-/*% Time after which we update LRU for all other records, 10 minutes */
-#define DNS_QPDB_LRUUPDATE_REGULAR 600
-
 /*
  * This defines the number of headers that we try to expire each time the
  * expire_ttl_headers() is run. The number should be small enough, so the
@@ -150,7 +136,8 @@
 #define DNS_QPDB_EXPIRE_TTL_COUNT 10

 /*%
- * This is the structure that is used for each node in the qp trie of trees.
+ * This is the structure that is used for each node in the qp trie of
+ * trees.
  */
 typedef struct qpcnode qpcnode_t;
 struct qpcnode {
@@ -224,11 +211,6 @@ typedef struct qpcache_bucket {
	/* Per-bucket lock. */
	isc_rwlock_t lock;

-	/*
-	 * Linked list used to implement LRU cache cleaning.
-	 */
-	dns_slabheaderlist_t lru;
-
	/*
	 * The heap is used for TTL based expiry. Note that qpcache->hmctx
	 * is the memory context to use for heap memory; this differs from
@@ -236,10 +218,14 @@ typedef struct qpcache_bucket {
	 */
	isc_heap_t *heap;

+	/* SIEVE-LRU cache cleaning state. */
+	ISC_SIEVE(dns_slabheader_t) sieve;
+
	/* Padding to prevent false sharing between locks. */
	uint8_t __padding[ISC_OS_CACHELINE_SIZE -
			  (sizeof(isc_queue_t) + sizeof(isc_rwlock_t) +
-			   sizeof(dns_slabheaderlist_t) + sizeof(isc_heap_t *)) %
+			   sizeof(isc_heap_t *) +
+			   sizeof(ISC_SIEVE(dns_slabheader_t))) %
				  ISC_OS_CACHELINE_SIZE];
 } qpcache_bucket_t;

@@ -285,17 +271,6 @@ struct qpcache {
	 */
	uint32_t serve_stale_refresh;

-	/*
-	 * Start point % node_lock_count for next LRU cleanup.
-	 */
-	atomic_uint lru_sweep;
-
-	/*
-	 * When performing LRU cleaning limit cleaning to headers that were
-	 * last used at or before this.
-	 */
-	_Atomic(isc_stdtime_t) last_used;
-
	/* Locked by tree_lock. */
	dns_qp_t *tree;
	dns_qp_t *nsec;
@@ -457,6 +432,9 @@ qpcache__destroy(qpcache_t *qpdb);

 static dns_dbmethods_t qpdb_cachemethods;

+static void
+cleanup_deadnodes_cb(void *arg);
+
 /*%
  * 'init_count' is used to initialize 'newheader->count' which in turn
  * is used to determine where in the cycle rrset-order cyclic starts.
@@ -480,116 +458,84 @@ static atomic_uint_fast16_t init_count = 0;
  * Failure to follow this hierarchy can result in deadlock.
  */

-/*%
- * Routines for LRU-based cache management.
- */
-
-/*%
- * See if a given cache entry that is being reused needs to be updated
- * in the LRU-list. From the LRU management point of view, this function is
- * expected to return true for almost all cases. When used with threads,
- * however, this may cause a non-negligible performance penalty because a
- * writer lock will have to be acquired before updating the list.
- * If DNS_QPDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
- * function returns true if the entry has not been updated for some period of
- * time. We differentiate the NS or glue address case and the others since
- * experiments have shown that the former tends to be accessed relatively
- * infrequently and the cost of cache miss is higher (e.g., a missing NS
- * records may cause external queries at a higher level zone, involving more
- * transactions).
- *
- * Caller must hold the node (read or write) lock.
- */
-static bool
-need_headerupdate(dns_slabheader_t *header, isc_stdtime_t now) {
-	if (DNS_SLABHEADER_GETATTR(header, (DNS_SLABHEADERATTR_NONEXISTENT |
-					    DNS_SLABHEADERATTR_ANCIENT |
-					    DNS_SLABHEADERATTR_ZEROTTL)) != 0)
-	{
-		return false;
-	}
-
-#if DNS_QPDB_LIMITLRUUPDATE
-	if (header->type == dns_rdatatype_ns ||
-	    (header->trust == dns_trust_glue &&
-	     dns_rdatatype_isaddr(header->type)))
-	{
-		/*
-		 * Glue records are updated if at least DNS_QPDB_LRUUPDATE_GLUE
-		 * seconds have passed since the previous update time.
-		 */
-		return header->last_used + DNS_QPDB_LRUUPDATE_GLUE <= now;
-	}
-
-	/*
-	 * Other records are updated if DNS_QPDB_LRUUPDATE_REGULAR seconds
-	 * have passed.
-	 */
-	return header->last_used + DNS_QPDB_LRUUPDATE_REGULAR <= now;
-#else
-	UNUSED(now);
-
-	return true;
-#endif /* if DNS_QPDB_LIMITLRUUPDATE */
-}
-
-/*%
- * Update the timestamp of a given cache entry and move it to the head
- * of the corresponding LRU list.
- *
- * Caller must hold the node (write) lock.
- *
- * Note that the we do NOT touch the heap here, as the TTL has not changed.
- */
-static void
-update_header(qpcache_t *qpdb, dns_slabheader_t *header, isc_stdtime_t now) {
-	/* To be checked: can we really assume this? XXXMLG */
-	INSIST(ISC_LINK_LINKED(header, link));
-
-	ISC_LIST_UNLINK(qpdb->buckets[HEADERNODE(header)->locknum].lru, header,
-			link);
-	header->last_used = now;
-	ISC_LIST_PREPEND(qpdb->buckets[HEADERNODE(header)->locknum].lru, header,
-			 link);
-}
-
-static void
-maybe_update_headers(qpcache_t *qpdb, dns_slabheader_t *found,
-		     dns_slabheader_t *foundsig, isc_rwlock_t *nlock,
-		     isc_rwlocktype_t *nlocktypep, isc_stdtime_t now) {
-	if (need_headerupdate(found, now) ||
-	    (foundsig != NULL && need_headerupdate(foundsig, now)))
-	{
-		if (*nlocktypep != isc_rwlocktype_write) {
-			NODE_FORCEUPGRADE(nlock, nlocktypep);
-		}
-		if (need_headerupdate(found, now)) {
-			update_header(qpdb, found, now);
-		}
-		if (foundsig != NULL && need_headerupdate(foundsig, now)) {
-			update_header(qpdb, foundsig, now);
-		}
-	}
-}
-
 /*
- * Locking:
- * If a routine is going to lock more than one lock in this module, then
- * the locking must be done in the following order:
- *
- *	Tree Lock
- *
- *	Node Lock (Only one from the set may be locked at one time by
- *	any caller)
- *
- *	Database Lock
- *
- * Failure to follow this hierarchy can result in deadlock.
- *
- * Deleting Nodes:
- * For zone databases the node for the origin of the zone MUST NOT be deleted.
+ * Cache-eviction routines.
  */

+static void
+expireheader(dns_slabheader_t *header, isc_rwlocktype_t *nlocktypep,
+	     isc_rwlocktype_t *tlocktypep, dns_expire_t reason DNS__DB_FLARG);
+
+static size_t
+rdataset_size(dns_slabheader_t *header) {
+	if (EXISTS(header)) {
+		return dns_rdataslab_size(header);
+	}
+
+	return sizeof(*header);
+}
+
+static void
+expire_lru_headers(qpcache_t *qpdb, uint32_t idx, size_t requested,
+		   isc_rwlocktype_t *nlocktypep,
+		   isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
+	size_t expired = 0;
+
+	do {
+		dns_slabheader_t *header =
+			ISC_SIEVE_NEXT(qpdb->buckets[idx].sieve, visited, link);
+		if (header == NULL) {
+			return;
+		}
+
+		ISC_SIEVE_UNLINK(qpdb->buckets[idx].sieve, header, link);
+
+		expired += rdataset_size(header);
+
+		expireheader(header, nlocktypep, tlocktypep,
+			     dns_expire_lru DNS__DB_FLARG_PASS);
+	} while (expired < requested);
+}
+
+static void
+qpcache_miss(qpcache_t *qpdb, dns_slabheader_t *newheader,
+	     isc_rwlocktype_t *nlocktypep,
+	     isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
+	uint32_t idx = HEADERNODE(newheader)->locknum;
+
+	isc_heap_insert(qpdb->buckets[idx].heap, newheader);
+	newheader->heap = qpdb->buckets[idx].heap;
+
+	if (isc_mem_isovermem(qpdb->common.mctx)) {
+		/*
+		 * Maximum estimated size of the data being added: The size
+		 * of the rdataset, plus a new QP database node and nodename,
+		 * and a possible additional NSEC node and nodename. Also add
+		 * a 12k margin for a possible QP-trie chunk allocation.
+		 * (It's okay to overestimate, we want to get cache memory
+		 * down quickly.)
+		 */
+		size_t purgesize =
+			2 * (sizeof(qpcnode_t) +
+			     dns_name_size(&HEADERNODE(newheader)->name)) +
+			rdataset_size(newheader) + 12288;
+
+		expire_lru_headers(qpdb, idx, purgesize, nlocktypep,
+				   tlocktypep DNS__DB_FLARG_PASS);
+	}
+
+	ISC_SIEVE_INSERT(qpdb->buckets[idx].sieve, newheader, link);
+}
+
+static void
+qpcache_hit(qpcache_t *qpdb ISC_ATTR_UNUSED, dns_slabheader_t *header) {
+	/*
+	 * On cache hit, we only mark the header as seen.
+	 */
+	ISC_SIEVE_MARK(header, visited);
+}
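As a back-of-the-envelope check of the estimate above (the sizes are
hypothetical, not measured): if sizeof(qpcnode_t) were 120 bytes, the
owner name 20 bytes, and the new slab 500 bytes, qpcache_miss() would
request 2 * (120 + 20) + 500 + 12288 = 13068 bytes, and
expire_lru_headers() keeps evicting SIEVE victims until at least that
much has been reclaimed or the bucket's queue runs empty.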
+
 /*
  * DB Routines
  */
@@ -733,9 +679,6 @@ qpcnode_acquire(qpcache_t *qpdb, qpcnode_t *node, isc_rwlocktype_t nlocktype,
			tlocktype DNS__DB_FLARG_PASS);
 }

-static void
-cleanup_deadnodes(void *arg);
-
 /*
  * Decrement the external references to a node. If the counter
  * goes to zero, decrement the node use counter in the qpcache object
@@ -838,7 +781,8 @@ qpcnode_release(qpcache_t *qpdb, qpcnode_t *node, isc_rwlocktype_t *nlocktypep,
		isc_loop_t *loop = isc_loop_get(qpdb->loopmgr, node->locknum);

-		isc_async_run(loop, cleanup_deadnodes, qpdb);
+		qpcache_ref(qpdb);
+		isc_async_run(loop, cleanup_deadnodes_cb, qpdb);
	}
 }

@@ -921,13 +865,6 @@ setttl(dns_slabheader_t *header, isc_stdtime_t newts) {
	header->expire = newts;

-	if (header->db == NULL || !dns_db_iscache(header->db)) {
-		return;
-	}
-
-	/*
-	 * This is a cache. Adjust the heaps if necessary.
-	 */
	if (header->heap == NULL || header->heap_index == 0 || newts == oldts) {
		return;
	}
@@ -1130,9 +1067,11 @@ bindrdatasets(qpcache_t *qpdb, qpcnode_t *qpnode, dns_slabheader_t *found,
	      dns_rdataset_t *sigrdataset DNS__DB_FLARG) {
	bindrdataset(qpdb, qpnode, found, now, nlocktype, tlocktype,
		     rdataset DNS__DB_FLARG_PASS);
+	qpcache_hit(qpdb, found);
	if (!NEGATIVE(found) && foundsig != NULL) {
		bindrdataset(qpdb, qpnode, foundsig, now, nlocktype, tlocktype,
			     sigrdataset DNS__DB_FLARG_PASS);
+		qpcache_hit(qpdb, foundsig);
	}
 }

@@ -1172,9 +1111,6 @@ setup_delegation(qpc_search_t *search, dns_dbnode_t **nodep,
		      search->zonecut_sigheader, search->now, nlocktype,
		      tlocktype, rdataset, sigrdataset DNS__DB_FLARG_PASS);

-	maybe_update_headers(search->qpdb, search->zonecut_header,
-			     search->zonecut_sigheader, nlock, &nlocktype,
-			     search->now);
	NODE_UNLOCK(nlock, &nlocktype);
 }

@@ -1411,8 +1347,6 @@ find_deepest_zonecut(qpc_search_t *search, qpcnode_t *node,
			      search->now, nlocktype, isc_rwlocktype_none,
			      rdataset, sigrdataset DNS__DB_FLARG_PASS);

-		maybe_update_headers(search->qpdb, found, foundsig, nlock,
-				     &nlocktype, search->now);
	}

	NODE_UNLOCK(nlock, &nlocktype);
@@ -1505,8 +1439,6 @@ find_coveringnsec(qpc_search_t *search, const dns_name_t *name,
	bindrdatasets(search->qpdb, node, found, foundsig, search->now,
		      nlocktype, isc_rwlocktype_none, rdataset,
		      sigrdataset DNS__DB_FLARG_PASS);

-	maybe_update_headers(search->qpdb, found, foundsig, nlock, &nlocktype,
-			     search->now);
	dns_name_copy(fname, foundname);

	result = DNS_R_COVERINGNSEC;
@@ -1796,8 +1728,6 @@ qpcache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
		bindrdatasets(search.qpdb, node, nsecheader, nsecsig,
			      search.now, nlocktype, tlocktype, rdataset,
			      sigrdataset DNS__DB_FLARG_PASS);

-		maybe_update_headers(search.qpdb, nsecheader, nsecsig, nlock,
-				     &nlocktype, search.now);
		result = DNS_R_COVERINGNSEC;
		goto node_exit;
	}
@@ -1831,8 +1761,6 @@ qpcache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
		bindrdatasets(search.qpdb, node, nsheader, nssig, search.now,
			      nlocktype, tlocktype, rdataset,
			      sigrdataset DNS__DB_FLARG_PASS);

-		maybe_update_headers(search.qpdb, nsheader, nssig, nlock,
-				     &nlocktype, search.now);
		result = DNS_R_DELEGATION;
		goto node_exit;
	}
@@ -1885,8 +1813,6 @@ qpcache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
		bindrdatasets(search.qpdb, node, found, foundsig, search.now,
			      nlocktype, tlocktype, rdataset,
			      sigrdataset DNS__DB_FLARG_PASS);

-		maybe_update_headers(search.qpdb, found, foundsig, nlock,
-				     &nlocktype, search.now);
	}

 node_exit:
@@ -1978,8 +1904,6 @@ seek_ns_headers(qpc_search_t *search, qpcnode_t *node, dns_dbnode_t **nodep,
	bindrdatasets(search->qpdb, node, found, foundsig, search->now,
		      nlocktype, *tlocktype, rdataset,
		      sigrdataset DNS__DB_FLARG_PASS);

-	maybe_update_headers(search->qpdb, found, foundsig, nlock, &nlocktype,
-			     search->now);
	NODE_UNLOCK(nlock, &nlocktype);

@@ -2115,8 +2039,6 @@ qpcache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
		bindrdatasets(qpdb, qpnode, found, foundsig, search.now,
			      nlocktype, isc_rwlocktype_none, rdataset,
			      sigrdataset DNS__DB_FLARG_PASS);

-		maybe_update_headers(qpdb, found, foundsig, nlock, &nlocktype,
-				     search.now);
	}

	NODE_UNLOCK(nlock, &nlocktype);
@@ -2219,114 +2141,6 @@ expiredata(dns_db_t *db, dns_dbnode_t *node, void *data) {
	INSIST(tlocktype == isc_rwlocktype_none);
 }

-static size_t
-rdataset_size(dns_slabheader_t *header) {
-	if (EXISTS(header)) {
-		return dns_rdataslab_size(header);
-	}
-
-	return sizeof(*header);
-}
-
-static size_t
-expire_lru_headers(qpcache_t *qpdb, unsigned int locknum,
-		   isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
-		   size_t purgesize DNS__DB_FLARG) {
-	dns_slabheader_t *header = NULL;
-	size_t purged = 0;
-
-	for (header = ISC_LIST_TAIL(qpdb->buckets[locknum].lru);
-	     header != NULL && header->last_used <= qpdb->last_used &&
-	     purged <= purgesize;
-	     header = ISC_LIST_TAIL(qpdb->buckets[locknum].lru))
-	{
-		size_t header_size = rdataset_size(header);
-
-		/*
-		 * Unlink the entry at this point to avoid checking it
-		 * again even if it's currently used someone else and
-		 * cannot be purged at this moment. This entry won't be
-		 * referenced any more (so unlinking is safe) since the
-		 * TTL will be reset to 0.
-		 */
-		ISC_LIST_UNLINK(qpdb->buckets[locknum].lru, header, link);
-		expireheader(header, nlocktypep, tlocktypep,
-			     dns_expire_lru DNS__DB_FLARG_PASS);
-		purged += header_size;
-	}
-
-	return purged;
-}
-
-/*%
- * Purge some expired and/or stale (i.e. unused for some period) cache entries
- * due to an overmem condition. To recover from this condition quickly,
- * we clean up entries up to the size of newly added rdata that triggered
- * the overmem; this is accessible via newheader.
- *
- * The LRU lists tails are processed in LRU order to the nearest second.
- *
- * A write lock on the tree must be held.
- */
-static void
-overmem(qpcache_t *qpdb, dns_slabheader_t *newheader,
-	isc_rwlocktype_t *tlocktypep DNS__DB_FLARG) {
-	uint32_t locknum_start = qpdb->lru_sweep++ % qpdb->buckets_count;
-	uint32_t locknum = locknum_start;
-	size_t purgesize, purged = 0;
-	isc_stdtime_t min_last_used = 0;
-	size_t max_passes = 8;
-
-	/*
-	 * Maximum estimated size of the data being added: The size
-	 * of the rdataset, plus a new QP database node and nodename,
-	 * and a possible additional NSEC node and nodename. Also add
-	 * a 12k margin for a possible QP-trie chunk allocation.
-	 * (It's okay to overestimate, we want to get cache memory
-	 * down quickly.)
-	 */
-	purgesize = 2 * (sizeof(qpcnode_t) +
-			 dns_name_size(&HEADERNODE(newheader)->name)) +
-		    rdataset_size(newheader) + 12288;
-again:
-	do {
-		isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
-		isc_rwlock_t *nlock = &qpdb->buckets[locknum].lock;
-		NODE_WRLOCK(nlock, &nlocktype);
-
-		purged += expire_lru_headers(
-			qpdb, locknum, &nlocktype, tlocktypep,
-			purgesize - purged DNS__DB_FLARG_PASS);
-
-		/*
-		 * Work out the oldest remaining last_used values of the list
-		 * tails as we walk across the array of lru lists.
-		 */
-		dns_slabheader_t *header =
-			ISC_LIST_TAIL(qpdb->buckets[locknum].lru);
-		if (header != NULL &&
-		    (min_last_used == 0 || header->last_used < min_last_used))
-		{
-			min_last_used = header->last_used;
-		}
-		NODE_UNLOCK(nlock, &nlocktype);
-		locknum = (locknum + 1) % qpdb->buckets_count;
-	} while (locknum != locknum_start && purged <= purgesize);
-
-	/*
-	 * Update qpdb->last_used if we have walked all the list tails and have
-	 * not freed the required amount of memory.
-	 */
-	if (purged < purgesize) {
-		if (min_last_used != 0) {
-			qpdb->last_used = min_last_used;
-			if (max_passes-- > 0) {
-				goto again;
-			}
-		}
-	}
-}
-
 /*%
  * These functions allow the heap code to rank the priority of each
  * element. It returns true if v1 happens "sooner" than v2.
@@ -2385,7 +2199,7 @@ qpcache__destroy(qpcache_t *qpdb) {
	for (i = 0; i < qpdb->buckets_count; i++) {
		NODE_DESTROYLOCK(&qpdb->buckets[i].lock);

-		INSIST(ISC_LIST_EMPTY(qpdb->buckets[i].lru));
+		INSIST(ISC_SIEVE_EMPTY(qpdb->buckets[i].sieve));
		INSIST(isc_queue_empty(&qpdb->buckets[i].deadnodes));

		isc_queue_destroy(&qpdb->buckets[i].deadnodes);
@@ -2428,9 +2242,7 @@ qpcache_destroy(dns_db_t *arg) {
  * to wait for the tree write lock.
  */
 static void
-cleanup_deadnodes(void *arg) {
-	qpcache_t *qpdb = arg;
-	uint16_t locknum = isc_tid();
+cleanup_deadnodes(qpcache_t *qpdb, uint16_t locknum) {
	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
	isc_rwlock_t *nlock = &qpdb->buckets[locknum].lock;
@@ -2444,8 +2256,7 @@ cleanup_deadnodes(void *arg) {
	TREE_WRLOCK(&qpdb->tree_lock, &tlocktype);
	NODE_WRLOCK(nlock, &nlocktype);

-	RUNTIME_CHECK(isc_queue_splice(&deadnodes,
-				       &qpdb->buckets[locknum].deadnodes));
+	isc_queue_splice(&deadnodes, &qpdb->buckets[locknum].deadnodes);
	isc_queue_for_each_entry_safe(&deadnodes, qpnode, qpnext, deadlink) {
		qpcnode_release(qpdb, qpnode, &nlocktype,
				&tlocktype DNS__DB_FILELINE);
@@ -2455,6 +2266,14 @@ cleanup_deadnodes(void *arg) {
	TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
 }

+static void
+cleanup_deadnodes_cb(void *arg) {
+	qpcache_t *qpdb = arg;
+	uint16_t locknum = isc_tid();
+
+	cleanup_deadnodes(qpdb, locknum);
+	qpcache_unref(qpdb);
+}
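The cleanup_deadnodes()/cleanup_deadnodes_cb() split follows the usual
reference-counted async pattern that the hunks above apply: pin the
database before posting the event, unpin in the callback, so the cache
cannot be torn down while the event is still queued. Schematically (a
sketch of the pattern; post_cleanup() is hypothetical, not a call site
in the patch):

/* Poster: the reference keeps qpdb alive while the event is queued. */
static void
post_cleanup(qpcache_t *qpdb, isc_loop_t *loop) {
	qpcache_ref(qpdb);
	isc_async_run(loop, cleanup_deadnodes_cb, qpdb);
}

/* Callback: runs on the loop that owns this bucket, then unpins. */
static void
cleanup_deadnodes_cb(void *arg) {
	qpcache_t *qpdb = arg;

	cleanup_deadnodes(qpdb, isc_tid());
	qpcache_unref(qpdb);
}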
+
 /*
  * This function is assumed to be called when a node is newly referenced
  * and can be in the deadnode list. In that case the node will be references
@@ -2657,7 +2476,6 @@ add(qpcache_t *qpdb, qpcnode_t *qpnode,
	dns_slabheader_t *prioheader = NULL, *expireheader = NULL;
	dns_typepair_t negtype = 0;
	dns_trust_t trust;
-	int idx;
	uint32_t ntypes = 0;

	if ((options & DNS_DBADD_FORCE) != 0) {
@@ -2833,17 +2651,9 @@ find_header:
			if (header->expire > newheader->expire) {
				setttl(header, newheader->expire);
			}
-			if (header->last_used != now) {
-				ISC_LIST_UNLINK(
-					qpdb->buckets[HEADERNODE(header)->locknum]
-						.lru,
-					header, link);
-				header->last_used = now;
-				ISC_LIST_PREPEND(
-					qpdb->buckets[HEADERNODE(header)->locknum]
-						.lru,
-					header, link);
-			}
+
+			qpcache_hit(qpdb, header);
+
			if (header->noqname == NULL &&
			    newheader->noqname != NULL)
			{
@@ -2895,17 +2705,9 @@ find_header:
			if (header->expire > newheader->expire) {
				setttl(header, newheader->expire);
			}
-			if (header->last_used != now) {
-				ISC_LIST_UNLINK(
-					qpdb->buckets[HEADERNODE(header)->locknum]
-						.lru,
-					header, link);
-				header->last_used = now;
-				ISC_LIST_PREPEND(
-					qpdb->buckets[HEADERNODE(header)->locknum]
-						.lru,
-					header, link);
-			}
+
+			qpcache_hit(qpdb, header);
+
			if (header->noqname == NULL &&
			    newheader->noqname != NULL)
			{
@@ -2927,17 +2729,9 @@ find_header:
			return ISC_R_SUCCESS;
		}

-		idx = HEADERNODE(newheader)->locknum;
-		isc_heap_insert(qpdb->buckets[idx].heap, newheader);
-		newheader->heap = qpdb->buckets[idx].heap;
-		if (ZEROTTL(newheader)) {
-			newheader->last_used = qpdb->last_used + 1;
-			ISC_LIST_APPEND(qpdb->buckets[idx].lru, newheader,
-					link);
-		} else {
-			ISC_LIST_PREPEND(qpdb->buckets[idx].lru, newheader,
-					 link);
-		}
+		qpcache_miss(qpdb, newheader, &nlocktype,
+			     &tlocktype DNS__DB_FLARG_PASS);
+
		if (topheader_prev != NULL) {
			topheader_prev->next = newheader;
		} else {
@@ -2961,17 +2755,8 @@ find_header:
		/* No rdatasets of the given type exist at the node. */
		INSIST(newheader->down == NULL);

-		idx = HEADERNODE(newheader)->locknum;
-		isc_heap_insert(qpdb->buckets[idx].heap, newheader);
-		newheader->heap = qpdb->buckets[idx].heap;
-		if (ZEROTTL(newheader)) {
-			ISC_LIST_APPEND(qpdb->buckets[idx].lru, newheader,
-					link);
-		} else {
-			ISC_LIST_PREPEND(qpdb->buckets[idx].lru, newheader,
-					 link);
-		}
-
+		qpcache_miss(qpdb, newheader, &nlocktype,
+			     &tlocktype DNS__DB_FLARG_PASS);
		if (prio_header(newheader)) {
			/* This is a priority type, prepend it */
			newheader->next = qpnode->data;
@@ -3097,7 +2882,7 @@ cleanup:
 static void
 expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
		   isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
-		   isc_stdtime_t now, bool cache_is_overmem DNS__DB_FLARG);
+		   isc_stdtime_t now DNS__DB_FLARG);

 static isc_result_t
 qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
@@ -3114,7 +2899,6 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
	isc_rwlocktype_t tlocktype = isc_rwlocktype_none;
	isc_rwlocktype_t nlocktype = isc_rwlocktype_none;
	isc_rwlock_t *nlock = NULL;
-	bool cache_is_overmem = false;
	dns_fixedname_t fixed;
	dns_name_t *name = NULL;
	isc_stdtime_t now = __now ? __now : isc_stdtime_now();
@@ -3140,8 +2924,6 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
	newheader = (dns_slabheader_t *)region.base;
	dns_slabheader_reset(newheader, db, node);

-	newheader->last_used = now;
-
	/*
	 * By default, dns_rdataslab_fromrdataset() sets newheader->ttl
	 * to the rdataset TTL. In the case of the cache, that's wrong;
@@ -3195,34 +2977,17 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
	/*
	 * Add to the auxiliary NSEC tree if we're adding an NSEC record.
	 */
-	TREE_RDLOCK(&qpdb->tree_lock, &tlocktype);
-	if (qpnode->nsec != DNS_DB_NSEC_HAS_NSEC &&
-	    rdataset->type == dns_rdatatype_nsec)
-	{
-		newnsec = true;
-	} else {
-		newnsec = false;
-	}
-	TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
+	newnsec = (qpnode->nsec != DNS_DB_NSEC_HAS_NSEC &&
+		   rdataset->type == dns_rdatatype_nsec);

	/*
-	 * If we're adding a delegation type, adding to the auxiliary NSEC
-	 * tree, or the DB is a cache in an overmem state, hold an
-	 * exclusive lock on the tree. In the latter case the lock does
-	 * not necessarily have to be acquired but it will help purge
-	 * ancient entries more effectively.
+	 * If we're adding a delegation type or adding to the auxiliary
+	 * NSEC tree, hold an exclusive lock on the tree.
	 */
-	if (isc_mem_isovermem(qpdb->common.mctx)) {
-		cache_is_overmem = true;
-	}
-	if (delegating || newnsec || cache_is_overmem) {
+	if (delegating || newnsec) {
		TREE_WRLOCK(&qpdb->tree_lock, &tlocktype);
	}

-	if (cache_is_overmem) {
-		overmem(qpdb, newheader, &tlocktype DNS__DB_FLARG_PASS);
-	}
-
	nlock = &qpdb->buckets[qpnode->locknum].lock;
	NODE_WRLOCK(nlock, &nlocktype);
@@ -3234,27 +2999,15 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
		true);
	}

-	expire_ttl_headers(qpdb, qpnode->locknum, &nlocktype, &tlocktype, now,
-			   cache_is_overmem DNS__DB_FLARG_PASS);
+	expire_ttl_headers(qpdb, qpnode->locknum, &nlocktype, &tlocktype,
+			   now DNS__DB_FLARG_PASS);

-	/*
-	 * If we've been holding a write lock on the tree just for
-	 * cleaning, we can release it now. However, we still need the
-	 * node lock.
-	 */
-	if (tlocktype == isc_rwlocktype_write && !delegating && !newnsec) {
-		TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
-	}
-
-	result = ISC_R_SUCCESS;
	if (newnsec) {
		qpcnode_t *nsecnode = NULL;

		result = dns_qp_getname(qpdb->nsec, name, (void **)&nsecnode,
					NULL);
-		if (result == ISC_R_SUCCESS) {
-			result = ISC_R_SUCCESS;
-		} else {
+		if (result != ISC_R_SUCCESS) {
			INSIST(nsecnode == NULL);
			nsecnode = new_qpcnode(qpdb, name);
			nsecnode->nsec = DNS_DB_NSEC_NSEC;
@@ -3265,11 +3018,9 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
		qpnode->nsec = DNS_DB_NSEC_HAS_NSEC;
	}

-	if (result == ISC_R_SUCCESS) {
-		result = add(qpdb, qpnode, name, newheader, options,
-			     addedrdataset, now, nlocktype,
-			     tlocktype DNS__DB_FLARG_PASS);
-	}
+	result = add(qpdb, qpnode, name, newheader, options, addedrdataset, now,
+		     nlocktype, tlocktype DNS__DB_FLARG_PASS);
+
	if (result == ISC_R_SUCCESS && delegating) {
		qpnode->delegating = 1;
	}
@@ -3279,6 +3030,7 @@ qpcache_addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
	if (tlocktype != isc_rwlocktype_none) {
		TREE_UNLOCK(&qpdb->tree_lock, &tlocktype);
	}
+	INSIST(tlocktype == isc_rwlocktype_none);

	return result;
@@ -3402,7 +3154,7 @@ dns__qpcache_create(isc_mem_t *mctx, const dns_name_t *origin,
	dns_rdatasetstats_create(mctx, &qpdb->rrsetstats);
	for (i = 0; i < (int)qpdb->buckets_count; i++) {
-		ISC_LIST_INIT(qpdb->buckets[i].lru);
+		ISC_SIEVE_INIT(qpdb->buckets[i].sieve);

		qpdb->buckets[i].heap = NULL;
		isc_heap_create(hmctx, ttl_sooner, set_index, 0,
@@ -3911,6 +3663,7 @@ deletedata(dns_db_t *db ISC_ATTR_UNUSED, dns_dbnode_t *node ISC_ATTR_UNUSED,
	   void *data) {
	dns_slabheader_t *header = data;
	qpcache_t *qpdb = (qpcache_t *)header->db;
+	int idx = HEADERNODE(header)->locknum;

	if (header->heap != NULL && header->heap_index != 0) {
		isc_heap_delete(header->heap, header->heap_index);
@@ -3920,8 +3673,7 @@ deletedata(dns_db_t *db ISC_ATTR_UNUSED, dns_dbnode_t *node ISC_ATTR_UNUSED,
		atomic_load_acquire(&header->attributes), false);

	if (ISC_LINK_LINKED(header, link)) {
-		int idx = HEADERNODE(header)->locknum;
-		ISC_LIST_UNLINK(qpdb->buckets[idx].lru, header, link);
+		ISC_SIEVE_UNLINK(qpdb->buckets[idx].sieve, header, link);
	}

	if (header->noqname != NULL) {
@@ -3938,7 +3690,7 @@ deletedata(dns_db_t *db ISC_ATTR_UNUSED, dns_dbnode_t *node ISC_ATTR_UNUSED,
 static void
 expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
		   isc_rwlocktype_t *nlocktypep, isc_rwlocktype_t *tlocktypep,
-		   isc_stdtime_t now, bool cache_is_overmem DNS__DB_FLARG) {
+		   isc_stdtime_t now DNS__DB_FLARG) {
	isc_heap_t *heap = qpdb->buckets[locknum].heap;

	for (size_t i = 0; i < DNS_QPDB_EXPIRE_TTL_COUNT; i++) {
@@ -3949,12 +3701,7 @@ expire_ttl_headers(qpcache_t *qpdb, unsigned int locknum,
			return;
		}

-		dns_ttl_t ttl = header->expire;
-
-		if (!cache_is_overmem) {
-			/* Only account for stale TTL if cache is not overmem */
-			ttl += STALE_TTL(header, qpdb);
-		}
+		dns_ttl_t ttl = header->expire + STALE_TTL(header, qpdb);

		if (ttl >= now - QPDB_VIRTUAL) {
			/*
diff --git a/lib/dns/rdataslab.c b/lib/dns/rdataslab.c
index 1aaafd0ba2..e7c5539458 100644
--- a/lib/dns/rdataslab.c
+++ b/lib/dns/rdataslab.c
@@ -865,6 +865,7 @@ dns_slabheader_reset(dns_slabheader_t *h, dns_db_t *db, dns_dbnode_t *node) {
	h->heap = NULL;
	h->db = db;
	h->node = node;
+	h->visited = false;

	atomic_init(&h->attributes, 0);
	atomic_init(&h->last_refresh_fail_ts, 0);
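Clearing the visited bit in dns_slabheader_reset() matters for recycled
headers: an entry that entered the cache already "marked" would survive
its first pass of the hand without ever having been hit. A one-line
invariant one could assert after reset (hypothetical, not in the patch):

	INSIST(!newheader->visited); /* a freshly reset header must look cold */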
diff --git a/lib/isc/Makefile.am b/lib/isc/Makefile.am
index 27cfe92f3e..1cdc540fb2 100644
--- a/lib/isc/Makefile.am
+++ b/lib/isc/Makefile.am
@@ -75,6 +75,7 @@ libisc_la_HEADERS = \
	include/isc/rwlock.h \
	include/isc/safe.h \
	include/isc/serial.h \
+	include/isc/sieve.h \
	include/isc/signal.h \
	include/isc/siphash.h \
	include/isc/sockaddr.h \
diff --git a/lib/isc/include/isc/sieve.h b/lib/isc/include/isc/sieve.h
new file mode 100644
index 0000000000..dfebeb55f1
--- /dev/null
+++ b/lib/isc/include/isc/sieve.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
+ *
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, you can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * See the COPYRIGHT file distributed with this work for additional
+ * information regarding copyright ownership.
+ */
+
+#pragma once
+
+/*! \file isc/sieve.h */
+
+/*
+ * Zhang, Yazhuo, Juncheng Yang, Yao Yue, Ymir Vigfusson, and K. V. Rashmi.
+ * “SIEVE Is Simpler than LRU: An Efficient Turn-Key Eviction Algorithm for
+ * Web Caches,” n.d.
+ *
+ * Algorithm 1 SIEVE
+ *
+ * Input: The request x, doubly-linked queue T, cache size C, hand p
+ *  1: if x is in T then			▷ Cache Hit
+ *  2:     x.visited ← 1
+ *  3: else					▷ Cache Miss
+ *  4:     if |T| = C then			▷ Cache Full
+ *  5:         o ← p
+ *  6:         if o is NULL then
+ *  7:             o ← tail of T
+ *  8:         while o.visited = 1 do
+ *  9:             o.visited ← 0
+ * 10:             o ← o.prev
+ * 11:             if o is NULL then
+ * 12:                 o ← tail of T
+ * 13:         p ← o.prev
+ * 14:         Discard o in T			▷ Eviction
+ * 15:     Insert x in the head of T.
+ * 16:     x.visited ← 0			▷ Insertion
+ *
+ * Data structure. SIEVE requires only one FIFO queue and one pointer
+ * called “hand”. The queue maintains the insertion order between objects.
+ * Each object in the queue uses one bit to track the visited/non-visited
+ * status. The hand points to the next eviction candidate in the cache and
+ * moves from the tail to the head. Note that, unlike existing algorithms,
+ * e.g., LRU, FIFO, and CLOCK, in which the eviction candidate is always
+ * the tail object, the eviction candidate in SIEVE is an object somewhere
+ * in the queue.
+ *
+ * SIEVE operations. A cache hit in SIEVE changes the visited bit of the
+ * accessed object to 1. For a popular object whose visited bit is already
+ * 1, SIEVE does not need to perform any operation. During a cache miss,
+ * SIEVE examines the object pointed to by the hand. If it has been
+ * visited, the visited bit is reset, and the hand moves to the next
+ * position (the retained object stays in the original position of the
+ * queue). It continues this process until it encounters an object with
+ * the visited bit being 0, and it evicts that object. After the eviction,
+ * the hand points to the next position (the previous object in the
+ * queue). While an evicted object is in the middle of the queue most of
+ * the time, a new object is always inserted into the head of the queue.
+ * In other words, the new objects and the retained objects are not mixed
+ * together.
+ *
+ * At first glance, SIEVE is similar to CLOCK/Second Chance/FIFO-Reinsertion.
+ * Each algorithm maintains a single queue in which each object is
+ * associated with a visited bit to track its access status. Visited
+ * objects are retained (also called "survived") during an eviction.
+ * Notably, new objects are inserted at the head of the queue in both SIEVE
+ * and FIFO-Reinsertion. However, the hand in SIEVE moves from the tail to
+ * the head over time, whereas the hand in FIFO-Reinsertion stays at the
+ * tail. The key difference is where a retained object is kept. SIEVE
+ * keeps it in the old position, while FIFO-Reinsertion inserts it at the
+ * head, together with newly inserted objects.
+ *
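+ * To illustrate the scan with a short trace (an editorial example, not
+ * from the paper): with the queue head→[E D C B A]←tail, visited bits
+ * set on B and D only, and the hand unset, a miss evicts A (bit 0) and
+ * parks the hand on B. The next miss clears B's bit, moves on and
+ * evicts C; B has earned one extra round for having been accessed,
+ * which is the whole of SIEVE's "promotion".
+ *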
+ * We detail the algorithm in Alg. 1. Line 1 checks whether there is a
+ * hit, and if so, then line 2 sets the visited bit to one. In the case of
+ * a cache miss (Line 3), Lines 5-12 identify the object to be evicted.
+ *
+ * Lazy promotion and quick demotion. Despite a simple design, SIEVE
+ * effectively incorporates both lazy promotion and quick demotion. An
+ * object is only promoted at the eviction time in lazy promotion. SIEVE
+ * operates in a similar manner. However, rather than promoting the object
+ * to the head of the queue, SIEVE keeps the object at its original
+ * location. The "survived" objects are generally more popular than the
+ * evicted ones; thus, they are likely to be accessed again in the future.
+ * By gathering the "survived" objects, the hand in SIEVE can quickly move
+ * from the tail to the area near the head, where most objects are newly
+ * inserted. These newly inserted objects are quickly examined by the hand
+ * of SIEVE after they are admitted into the cache, thus achieving quick
+ * demotion. This eviction mechanism makes SIEVE achieve both lazy
+ * promotion and quick demotion without adding too much overhead.
+ *
+ * The key ingredient of SIEVE is the moving hand, which functions like an
+ * adaptive filter that removes unpopular objects from the cache. This
+ * mechanism enables SIEVE to strike a balance between finding new popular
+ * objects and keeping old popular objects.
+ */
+
+#include <isc/list.h>
+
+#define ISC_SIEVE(type)              \
+	struct {                     \
+		ISC_LIST(type) list; \
+		type *hand;          \
+	}
+
+#define ISC_SIEVE_INIT(sieve)                \
+	{                                    \
+		ISC_LIST_INIT((sieve).list); \
+		(sieve).hand = NULL;         \
+	}
+
+#define ISC_SIEVE_EMPTY(sieve) ISC_LIST_EMPTY((sieve).list)
+
+#define ISC_SIEVE_MARKED(entry, visited) CMM_LOAD_SHARED((entry)->visited)
+#define ISC_SIEVE_MARK(entry, visited)                    \
+	if (!ISC_SIEVE_MARKED(entry, visited)) {          \
+		CMM_STORE_SHARED((entry)->visited, true); \
+	}
+#define ISC_SIEVE_UNMARK(entry, visited) \
+	CMM_STORE_SHARED((entry)->visited, false)
+
+/*
+ * Note: To match the original algorithm design, the
+ * SIEVE queue is iterated from tail to head.
+ */
+#define ISC_SIEVE_NEXT(sieve, visited, link)                                  \
+	({                                                                    \
+		__typeof__((sieve).hand) __hand = ((sieve).hand);             \
+		if (__hand == NULL && !ISC_LIST_EMPTY((sieve).list)) {        \
+			__hand = ISC_LIST_TAIL((sieve).list);                 \
+		}                                                             \
+                                                                              \
+		while (__hand != NULL && ISC_SIEVE_MARKED(__hand, visited)) { \
+			ISC_SIEVE_UNMARK(__hand, visited);                    \
+                                                                              \
+			__hand = ISC_LIST_PREV(__hand, link);                 \
+			if (__hand == NULL) {                                 \
+				/* We know the queue is not empty */          \
+				__hand = ISC_LIST_TAIL((sieve).list);         \
+			}                                                     \
+		}                                                             \
+		(sieve).hand = __hand;                                        \
+		__hand;                                                       \
+	})
+
+#define ISC_SIEVE_UNLINK(sieve, entry, link)                                 \
+	({                                                                   \
+		__typeof__((sieve).hand) __hand = (sieve).hand;              \
+		/* 1. Go to the previous node (possibly head of the list) */ \
+		if (entry == __hand) {                                       \
+			__hand = ISC_LIST_PREV(entry, link);                 \
+		}                                                            \
+                                                                             \
+		/* 2. Unlink the node from the list */                       \
+		ISC_LIST_UNLINK((sieve).list, entry, link);                  \
+                                                                             \
+		/* 3. If we reached the head, continue with the tail again */\
+		if (__hand == NULL && !ISC_LIST_EMPTY((sieve).list)) {       \
+			__hand = ISC_LIST_TAIL((sieve).list);                \
+		}                                                            \
+                                                                             \
+		(sieve).hand = __hand;                                       \
+	})
+
+#define ISC_SIEVE_INSERT(sieve, entry, link) \
+	ISC_LIST_PREPEND((sieve).list, entry, link)
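As a usage sketch of the macros above (editorial illustration, not part
of the patch; entry_t, TOY_MAX and the counters are hypothetical), a
cache built on ISC_SIEVE marks on hit and inserts/evicts on miss,
mirroring Algorithm 1:

#include <stdbool.h>
#include <stdlib.h>

#include <isc/list.h>
#include <isc/sieve.h>

typedef struct entry {
	bool visited;		     /* the SIEVE mark bit */
	ISC_LINK(struct entry) link; /* queue position */
} entry_t;

#define TOY_MAX 1024

static ISC_SIEVE(entry_t) cache;
static size_t nentries = 0;

static void
toy_init(void) {
	ISC_SIEVE_INIT(cache);
}

static void
toy_access(entry_t *e, bool hit) {
	if (hit) {
		/* Cache hit: only set the visited bit. */
		ISC_SIEVE_MARK(e, visited);
		return;
	}

	/* Cache miss: evict via the hand until there is room. */
	while (nentries >= TOY_MAX) {
		entry_t *victim = ISC_SIEVE_NEXT(cache, visited, link);
		if (victim == NULL) {
			break;
		}
		ISC_SIEVE_UNLINK(cache, victim, link);
		free(victim);
		nentries--;
	}

	/* New entries always enter at the head of the queue. */
	ISC_LINK_INIT(e, link);
	ISC_SIEVE_INSERT(cache, e, link);
	nentries++;
}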
diff --git a/tests/dns/qpdb_test.c b/tests/dns/qpdb_test.c
index 24101becb9..4b1de18f7d 100644
--- a/tests/dns/qpdb_test.c
+++ b/tests/dns/qpdb_test.c
@@ -112,6 +112,16 @@ overmempurge_addrdataset(dns_db_t *db, isc_stdtime_t now, int idx,
	dns_db_detachnode(db, &node);
 }

+static void
+cleanup_all_deadnodes(dns_db_t *db) {
+	qpcache_t *qpdb = (qpcache_t *)db;
+
+	qpcache_ref(qpdb);
+	for (uint16_t locknum = 0; locknum < qpdb->buckets_count; locknum++) {
+		cleanup_deadnodes(qpdb, locknum);
+	}
+	qpcache_unref(qpdb);
+}
+
 ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
	size_t maxcache = 2097152U; /* 2MB - same as DNS_CACHE_MINSIZE */
	size_t hiwater = maxcache - (maxcache >> 3); /* borrowed from cache.c */
@@ -150,6 +160,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
	 */
	while (i-- > 0) {
		overmempurge_addrdataset(db, now, i, 50054, 65535, false);
+		cleanup_all_deadnodes(db);
		if (verbose) {
			print_message("# inuse: %zd max: %zd\n",
				      isc_mem_inuse(mctx2), maxcache);
@@ -200,6 +211,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_longname) {
	 */
	while (i-- > 0) {
		overmempurge_addrdataset(db, now, i, 50054, 0, true);
+		cleanup_all_deadnodes(db);
		if (verbose) {
			print_message("# inuse: %zd max: %zd\n",
				      isc_mem_inuse(mctx2), maxcache);