From 4b5ec07bb712d26d9cae7f3871f6e1bfd6f23b35 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Thu, 22 Dec 2022 14:55:14 +0000 Subject: [PATCH] Refactor qp-trie to use QSBR The first working multi-threaded qp-trie was stuck with an unpleasant trade-off: * Use `isc_rwlock`, which has acceptable write performance, but terrible read scalability because the qp-trie made all accesses through a single lock. * Use `liburcu`, which has great read scalability, but terrible write performance, because I was relying on `rcu_synchronize()` which is rather slow. And `liburcu` is LGPL. To get the best of both worlds, we need our own scalable read side, which we now have with `isc_qsbr`. And we need to modify the write side so that it is not blocked by readers. Better write performance requires an async cleanup function like `call_rcu()`, instead of the blocking `rcu_synchronize()`. (There is no blocking cleanup in `isc_qsbr`, because I have concluded that it would be an attractive nuisance.) Until now, all my multithreading qp-trie designs have been based around two versions, read-only and mutable. This is too few to work with asynchronous cleanup. The bare minimum (as in epoch based reclamation) is three, but it makes more sense to support an arbitrary number. Doing multi-version support "properly" makes fewer assumptions about how safe memory reclamation works, and it makes snapshots and rollbacks simpler. To avoid making the memory management even more complicated, I have introduced a new kind of "packed reader node" to anchor the root of a version of the trie. This is simpler because it re-uses the existing chunk lifetime logic - see the discussion under "packed reader nodes" in `qp_p.h`. I have also made the chunk lifetime logic simpler. The idea of a "generation" is gone; instead, chunks are either mutable or immutable. And the QSBR phase number is used to indicate when a chunk can be reclaimed. 
Instead of the `shared_base` flag (which was basically a one-bit reference count, with a two version limit) the base array now has a refcount, which replaces the confusing ad-hoc lifetime logic with something more familiar and systematic. --- CHANGES | 5 + doc/design/qp-trie.md | 253 +++++----- fuzz/Makefile.am | 1 + fuzz/dns_qp.c | 1 + lib/dns/include/dns/qp.h | 144 +++--- lib/dns/qp.c | 963 ++++++++++++++++++++++----------------- lib/dns/qp_p.h | 580 ++++++++++++++++------- tests/bench/Makefile.am | 3 + tests/bench/load-names.c | 2 + tests/bench/qpmulti.c | 912 ++++++++++++++++++++++++++++-------- tests/dns/qp_test.c | 20 +- tests/dns/qpmulti_test.c | 90 ++-- tests/libtest/qp.c | 115 ++--- 13 files changed, 2040 insertions(+), 1049 deletions(-) diff --git a/CHANGES b/CHANGES index 73695a8f25..06e54c6df8 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,8 @@ +6117. [func] Add a qp-trie data structure. This is a foundation for + our plan to replace, in stages, BIND's red-black tree. + The qp-trie has lock-free multithreaded reads, using + QSBR for safe memory reclamation. [GL !7130] + 6116. [placeholder] 6115. [bug] Unregister db update notify callback before detaching diff --git a/doc/design/qp-trie.md b/doc/design/qp-trie.md index a5f4d9b6dd..fb0d9e28eb 100644 --- a/doc/design/qp-trie.md +++ b/doc/design/qp-trie.md @@ -362,7 +362,7 @@ one 64 bit word and one 32-bit word. A branch node contains - * a branch/leaf tag bit + * two type tag bits * a 47-wide bitmap, with a bit for each common hostname character and each escape character @@ -374,8 +374,8 @@ A branch node contains these references are described in more detail below A leaf node contains a pointer value (which we assume to be 64 bits) -and a 32-bit integer value. The branch/leaf tag is smuggled into the -low-order bit of the pointer value, so the pointer value must have +and a 32-bit integer value. 
The type tag is smuggled into the +low-order bits of the pointer value, so the pointer value must have large enough alignment. (This requirement is checked when a leaf is added to the trie.) Apart from that, the meaning of leaf values is entirely under control of the qp-trie user. @@ -478,8 +478,8 @@ labels. This is slightly different from the root node, which tested the first character of the label; here we are testing the last character. -memory management for concurrency ---------------------------------- +concurrency and transactions +---------------------------- The following sections discuss how the qp-trie supports concurrency. @@ -487,12 +487,32 @@ The requirement is to support many concurrent read threads, and allow updates to occur without blocking readers (or blocking readers as little as possible). +Concurrent access to a qp-trie uses a transactional API. There can be +at most one writer at a time. When a writer commits its transaction +(by atomically replacing the trie's root pointer) the changes become +visible to readers. Read transactions ensure that memory is not +reclaimed while readers are still using it. + +If there are relatively long read transactions and brief write +transactions (though that is unlikely) there can be multiple versions +of a qp-trie in use at a time. + + +copy-on-write +------------- + The strategy is to use "copy-on-write", that is, when an update needs to alter the trie it makes a copy of the parts that it needs to change, so that concurrent readers can continue to use the original. (It is analogous to multiversion concurrency in databases such as PostgreSQL, where copy-on-write uses a write-ahead log.) +The qp-trie only uses copy-on-write when the nodes that need to be +altered can be shared with concurrent readers. After copying, the +nodes are exclusive to the writer and can be updated in place. This +reduces the pressure on the allocator a lot: pure copy-on-write +allocates and discards memory at a ferocious rate. 
+ Software that uses copy-on-write needs some mechanism for clearing away old versions that are no longer in use. (For example, VACUUM in PostgreSQL.) The qp-trie code uses a custom allocator with a simple @@ -567,27 +587,114 @@ garbage collector. Reference counting for value objects is handled by the `attach()` and `detach()` qp-trie methods. -memory layout -------------- +chunked memory layout +--------------------- -BIND's qp-trie code organizes its memory as a collection of "chunks", -each of which is a few pages in size and large enough to hold a few -thousand nodes. - -Most memory management is per-chunk: obtaining memory from the -system allocator and returning it; keeping track of which chunks are -in use by readers, and which chunks can be mutated; and counting -whether chunks are fragmented enough to need garbage collection. +BIND's qp-trie code organizes its memory as a collection of "chunks" +allocated by `malloc()`, each of which is a few pages in size and +large enough to hold a thousand nodes or so. As noted above, we also use the chunk-based layout to reduce the size of interior nodes. Instead of using a native pointer (typically 64 bits) to refer to a node, we use a 32 bit integer containing the chunk number and the position of the node in the chunk. This reduces the -memory used by interior nodes by 25%. +memory used for interior nodes by 25%. See the "helper types" section +in `lib/dns/qp_p.h` for the relevant definitions. + +BIND stores each zone separately, and there can be a very large number +of zones in a server. To avoid wasting memory on small zones that only +have a few names, chunks can be "shrunk" using `realloc()` to fit just +the nodes that have been allocated. + + +chunk metadata +-------------- + +The chunked memory layout is supported by a `base` array of pointers +to the start of each chunk. A chunk number is just an index into this +array. + +Alongside the `base` array is a `usage` array, indexed the same way. 
+Instead of keeping track of individual nodes, the allocator just keeps +a count of how many nodes have been allocated from a chunk, and how +many were subsequently freed. The `used` count of the newest chunk +also serves as the allocation point for the bump allocator, and the +size of the chunk when it has been shrunk. This is why we increment +the `free` count when a node is discarded, instead of decrementing the +`used` count. The `usage` array also contains some fields used for +chunk reclamation, about which more below. + +The `base` and `usage` arrays are separate because the `usage` array +is only used by writers, and never shared with readers. The read-only +hot path only needs the `base` array, so keeping it separate is more +cache-friendly: less memory pressure on the read path and less +interference from false sharing with write ops. + +Both arrays can have empty slots in which new chunks can be allocated; +when a chunk is reclaimed its slot becomes empty. Additions and +removals from the `base` array don't affect readers: they will not see +a reference to a new chunk until after the writer commits, and the +chunk reclamation machinery ensures that no readers depend on a chunk +before it is deleted. + +When the arrays fill up they are reallocated. This is easy for the +`usage` array because it is only accessed by writers, but the `base` +array must be cloned, and the old version must be reclaimed later +after it is no longer used by readers. For this reason the `base` +array has a reference count. + + +lightweight write transactions +------------------------------ + +"Write" transactions are intended for use when there is a heavy write +load, such as a resolver cache. They minimize the amount of allocation +by re-using the same chunk for the bump allocator across multiple +transactions until it fills up. + +When a write (or update) is committed, a new packed read-only trie +anchor is created. 
This contains a pointer to the `base` array and a +32-bit reference to the trie's root node. The packed reader is stored +in a pair of nodes in the current chunk, allocated by the bump +allocator, so it does not need to be `malloc()`ed separately, and so +the chunk reclamation machinery can also reclaim the `base` array when +it is no longer in use. + + +heavyweight update transactions +------------------------------- + +By contrast, "update" transactions are intended to keep memory usage +as low as possible between writes. On commit, the trie is compacted, +and the bump allocator's chunk is shrunk to fit. When a transaction is +opened, a fresh chunk must be allocated. + +Update transactions also support rollback, which requires making a +copy of all the chunk metadata. + + +lightweight query transactions +------------------------------ + +A "query" transaction dereferences a pointer to the current trie +anchor and unpacks it into a `dns_qpread_t` object on the stack. There +is no explicit interlocking with writers. Instead, query transactions +must only be used inside an `isc_loop` callback function; the qp-trie +memory reclamation machinery knows that the reader has completed when +the callback returns to the loop. See `include/isc/qsbr.h` for more +about how this works. + + +heavyweight read-only snapshots +------------------------------- + +A "snapshot" is for things like zone transfers that need a long-lived +consistent view of a zone. When a snapshot is created, it includes a +copy of the necessary parts of the `base` array. A qp-trie keeps a +list of its snapshots, and there are flags in the `usage` array to +mark which chunks are in use by snapshots and therefore cannot be +reclaimed. -In `lib/dns/qp_p.h`, the _"main qp-trie structures"_ hold information -about a trie's chunks. Most of the chunk handling code is in the -_"allocator"_ and _"chunk reclamation"_ sections in `lib/dns/qp.c`. 
lifecycle of value objects @@ -609,103 +716,23 @@ adding special lookup functions that return whether leaf objects are mutable - see the "todo" in `include/dns/qp.h`. -locking and RCU ---------------- +chunk cleanup +------------- -The Linux kernel has a collection of copy-on-write schemes collectively -called read-copy-update; there is also https://liburcu.org/ for RCU in -userspace. RCU is attractively speedy: readers can proceed without -blocking at all; writers can proceed concurrently with readers, and -updates can be committed without blocking. A commit is just a single -atomic pointer update. RCU only requires writers to block when waiting -for a "grace period" while older readers complete their critical -sections, after which the writer can free memory that is no longer in -use. Writers must also block on a mutex to ensure there is only one -writer at a time. +After a "write" or "update" transaction has committed, there can be a +number of chunks that are no longer needed by the latest version of +the trie, but still in use by readers accessing an older version. +The qp-trie uses a QSBR callback to clean up chunks when they are no +longer used at all. -The qp-trie concurrency strategy is designed to be able to use RCU, but -RCU is not required. Instead of RCU we can use a reader-writer lock. -This requires readers to block when a writer commits, which (in RCU -style) just requires an atomic pointer swap. The rwlock also changes -when writers must block: commits must wait for readers to exit their -critical sections, but there is no further waiting to be able to release -memory. +When reclaiming a chunk, we have to scan it for any remaining leaf +nodes. When nodes are accessible only to the writer, they are zeroed +out when they are freed. If they are shared with readers, they must be +left in place (though the `free` count in the usage array is still +adjusted), and finally `detach()`ed when the chunk is reclaimed.
-In BIND, there are two kinds of reader: queries, which are relatiely -quick, and zone transfers, which are relatively slow. BIND's dbversion -machinery allows updates to proceed while there are long-running zone -transfers. RCU supports this without further machinery, but a -reader-writer lock needs some help so that long-running readers can -avoid blocking writers. - -To avoid blocking updates, long-running readers can take a snapshot of a -qp-trie, which only requires copying the allocator's chunk array. After -a writer commits, it does not releases memory if there are any -snapshots. Instead, chunks that are no longer needed by the latest -version of the trie are stashed on a list to be released later, -analogous to RCU waiting for a grace period. - -The locking occurs only in the functions under _"read-write -transactions"_ and _"read-only transactions"_ in `lib/dns/qp.c`. - - -immutability and copy-on-write ------------------------------- - -A qp-trie has a `generation` counter which is incremented by each -write transaction. We keep track of which generation each chunk was -created in; only chunks created in the current generation are -mutable, because older chunks may be in use by concurrent readers. - -This logic is implemented by `chunk_alloc()` and `chunk_mutable()` -in `lib/dns/qp.c`. - -The `make_twigs_mutable()` function ensures that a node is mutable, -copying it if necessary. - -The chunk arrays are a mixture of mutable and immutable. Pointers to -immutable chunks are immutable; new chunks can be assigned to unused -entries; and entries are cleared when it is safe to reclaim the chunks -they refer to. If the chunk arrays need to be expanded, the existing -arrays are retained for use by readers, and the writer uses the -expanded arrays (see `alloc_slow()`). The old arrays are cleaned up -after the writer commits. 
- - -update transactions -------------------- - -A typical heavy-weight `update` transaction comprises: - - * make a copy of the chunk arrays in case we need to roll back - - * get a freshly allocated chunk where new nodes or copied nodes - can be written - - * make any changes that are required; nodes in old chunks are - copied to the new space first; new nodes are modified in place - to avoid creating unnecessary garbage - - * when the updates are finished, and before committing, run the - garbage collector to clear out chunks that were fragmented by the - update - - * shrink the allocation chunk to eliminate unused space - - * commit the update by flipping the root pointer of the trie; this - is the only point that needs a multithreading interlock - - * free any chunks that were emptied by the garbage collector - -A lightweight `write` transaction is similar, except that: - - * rollback is not supported - - * any existing allocation chunk is reused if possible - - * the gabage collector is not run before committing - - * the allocation chunk is not shrunk +This chunk scan also cleans up old `base` arrays referred to by packed +reader nodes. testing strategies diff --git a/fuzz/Makefile.am b/fuzz/Makefile.am index b7a7a9aafc..05904a97dc 100644 --- a/fuzz/Makefile.am +++ b/fuzz/Makefile.am @@ -6,6 +6,7 @@ AM_CFLAGS += \ AM_CPPFLAGS += \ $(LIBISC_CFLAGS) \ $(LIBDNS_CFLAGS) \ + $(LIBUV_CFLAGS) \ -DFUZZDIR=\"$(abs_srcdir)\" AM_LDFLAGS += \ diff --git a/fuzz/dns_qp.c b/fuzz/dns_qp.c index 64a88db8c1..3f0b674c14 100644 --- a/fuzz/dns_qp.c +++ b/fuzz/dns_qp.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/lib/dns/include/dns/qp.h b/lib/dns/include/dns/qp.h index 82088e2b88..4604d6cee0 100644 --- a/lib/dns/include/dns/qp.h +++ b/lib/dns/include/dns/qp.h @@ -42,30 +42,24 @@ * value can be freed after it is no longer needed by readers using an old * version of the trie. 
* - * For fast concurrent reads, call `dns_qpmulti_query()` to get a - * `dns_qpread_t`. Readers can access a single version of the trie between - * write commits. Most write activity is not blocked by readers, but reads - * must finish before a write can commit (a read-write lock blocks - * commits). + * For fast concurrent reads, call `dns_qpmulti_query()` to fill in a + * `dns_qpread_t`, which must be allocated on the stack. Readers can + * access a single version of the trie within the scope of an isc_loop + * thread (NOT an isc_work thread). We rely on the loop to bound the + * lifetime of a `dns_qpread_t`, instead of using locks. Readers are + * not blocked by any write activity, and vice versa. * - * For long-running reads that need a stable view of the trie, while still - * allow commits to proceed, call `dns_qpmulti_snapshot()` to get a - * `dns_qpsnap_t`. It briefly gets the write mutex while creating the - * snapshot, which requires allocating a copy of some of the trie's - * metadata. A snapshot is for relatively heavy long-running read-only - * operations such as zone transfers. - * - * While snapshots exist, a qp-trie cannot reclaim memory: it does not - * retain detailed information about which memory is used by which - * snapshots, so it pessimistically retains all memory that might be - * used by old versions of the trie. + * For reads that need a stable view of the trie for multiple cycles + * of an isc_loop, or which can be used from any thread, call + * `dns_qpmulti_snapshot()` to get a `dns_qpsnap_t`. A snapshot is for + * relatively heavy long-running read-only operations such as zone + * transfers. * * You can start one read-write transaction at a time using * `dns_qpmulti_write()` or `dns_qpmulti_update()`. Either way, you * get a `dns_qp_t` that can be modified like a single-threaded trie, * without affecting other read-only query or snapshot users of the - * `dns_qpmulti_t`. 
Committing a transaction only blocks readers - briefly when flipping the active readonly `dns_qp_t` pointer. + `dns_qpmulti_t`. * * "Update" transactions are heavyweight. They allocate working memory to * hold modifications to the trie, and compact the trie before committing. @@ -96,34 +90,68 @@ typedef struct dns_qp dns_qp_t; /*% - * A `dns_qpmulti_t` supports multi-version concurrent reads and transactional - * modification. + * A `dns_qpmulti_t` supports multi-version wait-free concurrent reads + * and one transactional modification at a time. */ typedef struct dns_qpmulti dns_qpmulti_t; /*% - * A `dns_qpread_t` is a lightweight read-only handle on a `dns_qpmulti_t`. + * Read-only parts of a qp-trie. + * + * A `dns_qpreader_t` is the common prefix of the `dns_qpreadable` + * types, containing just the fields needed for the hot path. + * + * Ranty aside: annoyingly, C doesn't allow us to use a predeclared + * structure type as an anonymous struct member, so we have to use a + * macro. (GCC and Clang have the feature we want under -fms-extensions, + * but a non-standard extension won't make these declarations neater if + * we must also have a standard alternative.) */ -typedef struct dns_qpread dns_qpread_t; +#define DNS_QPREADER_FIELDS \ + uint32_t magic; \ + uint32_t root_ref; \ + dns_qpbase_t *base; \ + void *uctx; \ + const struct dns_qpmethods *methods + +typedef struct dns_qpbase dns_qpbase_t; /* private */ + +typedef struct dns_qpreader { + DNS_QPREADER_FIELDS; +} dns_qpreader_t; /*% - * A `dns_qpsnap_t` is a heavier read-only snapshot of a `dns_qpmulti_t`. + * A `dns_qpread_t` is a read-only handle on a `dns_qpmulti_t`. + * The caller provides space for it on the stack; it can be + * used by only one thread. As well as the `DNS_QPREADER_FIELDS`, + * it contains a thread ID to check for incorrect usage.
+ */ +typedef struct dns_qpread { + DNS_QPREADER_FIELDS; + uint32_t tid; +} dns_qpread_t; + +/*% + * A `dns_qpsnap_t` is a read-only snapshot of a `dns_qpmulti_t`. + * It requires allocation and taking the `dns_qpmulti_t` mutex to + * create; it can be used from any thread. */ typedef struct dns_qpsnap dns_qpsnap_t; -/* +/*% * The read-only qp-trie functions can work on either of the read-only - * qp-trie types or the general-purpose read-write `dns_qp_t`. They - * relies on the fact that all the `dns_qpreadable_t` structures start - * with a `dns_qpread_t`. + * qp-trie types dns_qpsnap_t or dns_qpread_t, or the general-purpose + * read-write `dns_qp_t`. They rely on the fact that all the + * `dns_qpreadable_t` structures start with a `dns_qpreader_t` */ typedef union dns_qpreadable { - dns_qpread_t *qpr; - dns_qpsnap_t *qps; - dns_qp_t *qpt; + dns_qpreader_t *qp; + dns_qpread_t *qpr; + dns_qpsnap_t *qps; + dns_qp_t *qpt; } dns_qpreadable_t __attribute__((__transparent_union__)); -#define dns_qpreadable_cast(qp) ((qp).qpr) +#define dns_qpreader(qpr) ((qpr).qp) /*% * A trie lookup key is a small array, allocated on the stack during trie @@ -136,9 +164,6 @@ typedef union dns_qpreadable { * common hostname character; otherwise unusual characters are escaped, * using two bytes in the key. So we allow keys to be up to 512 bytes. * (The actual max is (255 - 5) * 2 + 6 == 506) - * - * Every byte of a key must be greater than 0 and less than 48. Elements - * after the end of the key are treated as having the value 1. */ typedef uint8_t dns_qpkey_t[512]; @@ -154,7 +179,9 @@ typedef uint8_t dns_qpkey_t[512]; * * The `attach` and `detach` methods adjust reference counts on value * objects. They support copy-on-write and safe memory reclamation - * needed for multi-version concurrency. + * needed for multi-version concurrency. 
The methods are only called + * when the `dns_qpmulti_t` mutex is held, so they only need to use + * atomic ops if the refcounts are used by code other than the qp-trie. * * Note: When a value object reference count is greater than one, the * object is in use by concurrent readers so it must not be modified. A @@ -237,15 +264,17 @@ dns_qp_destroy(dns_qp_t **qptp); */ void -dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, +dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, + const dns_qpmethods_t *methods, void *uctx, dns_qpmulti_t **qpmp); /*%< * Create a multi-threaded qp-trie. * * Requires: * \li `mctx` is a pointer to a valid memory context. - * \li all the methods are non-NULL + * \li 'loopmgr' is a valid loop manager. * \li `qpmp != NULL && *qpmp == NULL` + * \li all the methods are non-NULL * * Ensures: * \li `*qpmp` is a pointer to a valid multi-threaded qp-trie @@ -400,7 +429,7 @@ dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival); * Requires: * \li `qp` is a pointer to a valid qp-trie * \li `pval != NULL` - * \li `alignof(pval) > 1` + * \li `alignof(pval) >= 4` * * Returns: * \li ISC_R_EXISTS if the trie already has a leaf with the same key @@ -440,34 +469,32 @@ dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name); */ void -dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t **qprp); +dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qpr); /*%< * Start a lightweight (brief) read-only transaction * - * This takes a read lock on `multi`s rwlock that prevents - * transactions from committing. + * The `dns_qpmulti_query()` function must be called from an isc_loop + * thread and its 'qpr' argument must be allocated on the stack. 
* * Requires: * \li `multi` is a pointer to a valid multi-threaded qp-trie - * \li `qprp != NULL` - * \li `*qprp == NULL` + * \li `qpr != NULL` * * Returns: - * \li `*qprp` is a pointer to a valid read-only qp-trie handle + * \li `qpr` is a valid read-only qp-trie handle */ void -dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t **qprp); +dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qpr); /*%< - * End a lightweight read transaction, i.e. release read lock + * End a lightweight read transaction. * * Requires: * \li `multi` is a pointer to a valid multi-threaded qp-trie - * \li `qprp != NULL` - * \li `*qprp` is a read-only qp-trie handle obtained from `multi` + * \li `qpr` is a read-only qp-trie handle obtained from `multi` * * Returns: - * \li `*qprp == NULL` + * \li `qpr` is invalidated */ void @@ -478,7 +505,7 @@ dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp); * This function briefly takes and releases the modification mutex * while allocating a copy of the trie's metadata. While the snapshot * exists it does not interfere with other read-only or read-write - * transactions on the trie, except that memory cannot be reclaimed. + * transactions on the trie. * * Requires: * \li `multi` is a pointer to a valid multi-threaded qp-trie @@ -494,10 +521,6 @@ dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp); /*%< * End a heavyweight read transaction * - * If this is the last remaining snapshot belonging to `multi` then - * this function takes the modification mutex in order to free() any - * memory that is no longer in use. - * * Requires: * \li `multi` is a pointer to a valid multi-threaded qp-trie * \li `qpsp != NULL` @@ -538,6 +561,12 @@ dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp); * for a large trie that gets frequent small writes, such as a DNS * cache. * + * A sequence of lightweight write transactions can accumulate + * garbage that the automatic compact/recycle cannot reclaim. 
+ * To reclaim this space, you can use the `dns_qp_memusage + * fragmented` flag to trigger a call to dns_qp_compact(), or you + * can use occasional update transactions to compact the trie. + * * During the transaction, the modification mutex is held. * * Requires: @@ -554,10 +583,9 @@ dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp); /*%< * Complete a modification transaction * - * The commit itself only requires flipping the read pointer inside - * `multi` from the old version of the trie to the new version. This - * function takes a write lock on `multi`s rwlock just long enough to - * flip the pointer. This briefly blocks `query` readers. + * Apart from memory management logistics, the commit itself only + * requires flipping the read pointer inside `multi` from the old + * version of the trie to the new version. Readers are not blocked. * * This function releases the modification mutex after the post-commit * memory reclamation is completed. diff --git a/lib/dns/qp.c b/lib/dns/qp.c index a8cb209d9d..b54a9e4ba7 100644 --- a/lib/dns/qp.c +++ b/lib/dns/qp.c @@ -28,12 +28,15 @@ #include #include +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -74,23 +77,22 @@ static atomic_uint_fast64_t rollback_time; #define LOG_STATS(...) #endif -#define PRItime " %" PRIu64 " us " - #if DNS_QP_TRACE /* * TRACE is generally used in allocation-related functions so it doesn't * trace very high-frequency ops */ -#define TRACE(fmt, ...) \ - if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) { \ - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, \ - DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7), \ - "%s:%d:%s(qp %p uctx \"%s\" gen %u): " fmt, \ - __FILE__, __LINE__, __func__, qp, TRIENAME(qp), \ - qp->generation, ##__VA_ARGS__); \ - } else \ - do { \ - } while (0) +#define TRACE(fmt, ...) 
\ + do { \ + if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) { \ + isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, \ + DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7), \ + "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \ + __FILE__, __LINE__, __func__, qp, \ + qp ? TRIENAME(qp) : "(null)", isc_tid(), \ + ##__VA_ARGS__); \ + } \ + } while (0) #else #define TRACE(...) #endif @@ -138,7 +140,8 @@ static void initialize_bits_for_byte(void) ISC_CONSTRUCTOR; /* - * The bit positions have to be between SHIFT_BITMAP and SHIFT_OFFSET. + * The bit positions for bytes inside labels have to be between + * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.) * * Each byte range in between common hostname characters has a different * escape character, to preserve the correct lexical order. @@ -327,20 +330,16 @@ chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) { } static void -write_protect(dns_qp_t *qp, void *ptr, bool readonly) { +write_protect(dns_qp_t *qp, qp_chunk_t chunk) { if (qp->write_protect) { - int prot = readonly ? 
PROT_READ : PROT_READ | PROT_WRITE; - size_t size = chunk_size_raw(); - RUNTIME_CHECK(mprotect(ptr, size, prot) >= 0); - } -} - -static void -write_protect_all(dns_qp_t *qp) { - for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (chunk != qp->bump && qp->base[chunk] != NULL) { - write_protect(qp, qp->base[chunk], true); + /* see transaction_open() wrt this special case */ + if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) { + return; } + TRACE("chunk %u", chunk); + void *ptr = qp->base->ptr[chunk]; + size_t size = chunk_size_raw(); + RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0); } } @@ -351,48 +350,15 @@ write_protect_all(dns_qp_t *qp) { #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size) -#define write_protect(qp, chunk, readonly) -#define write_protect_all(qp) +#define write_protect(qp, chunk) #endif -static void * -clone_array(isc_mem_t *mctx, void *oldp, size_t oldsz, size_t newsz, - size_t elemsz) { - uint8_t *newp = NULL; - - INSIST(oldsz <= newsz); - INSIST(newsz < UINT32_MAX); - INSIST(elemsz < UINT32_MAX); - INSIST(((uint64_t)newsz) * ((uint64_t)elemsz) <= UINT32_MAX); - - /* sometimes we clone an array before it has been populated */ - if (newsz > 0) { - oldsz *= elemsz; - newsz *= elemsz; - newp = isc_mem_allocate(mctx, newsz); - if (oldsz > 0) { - memmove(newp, oldp, oldsz); - } - memset(newp + oldsz, 0, newsz - oldsz); - } - return (newp); -} - /*********************************************************************** * * allocator */ -/* - * We can mutate a chunk if it was allocated in the current generation. - * This might not be true for the `bump` chunk when it is reused. - */ -static inline bool -chunk_immutable(dns_qp_t *qp, qp_chunk_t chunk) { - return (qp->usage[chunk].generation != qp->generation); -} - /* * When we reuse the bump chunk across multiple write transactions, * it can have an immutable prefix and a mutable suffix. 
@@ -404,7 +370,7 @@ cells_immutable(dns_qp_t *qp, qp_ref_t ref) { if (chunk == qp->bump) { return (cell < qp->fender); } else { - return (chunk_immutable(qp, chunk)); + return (qp->usage[chunk].immutable); } } @@ -413,81 +379,71 @@ cells_immutable(dns_qp_t *qp, qp_ref_t ref) { */ static qp_ref_t chunk_alloc(dns_qp_t *qp, qp_chunk_t chunk, qp_weight_t size) { - REQUIRE(qp->base[chunk] == NULL); - REQUIRE(qp->usage[chunk].generation == 0); - REQUIRE(qp->usage[chunk].used == 0); - REQUIRE(qp->usage[chunk].free == 0); + INSIST(qp->base->ptr[chunk] == NULL); + INSIST(qp->usage[chunk].used == 0); + INSIST(qp->usage[chunk].free == 0); - qp->base[chunk] = chunk_get_raw(qp); - qp->usage[chunk].generation = qp->generation; - qp->usage[chunk].used = size; - qp->usage[chunk].free = 0; + qp->base->ptr[chunk] = chunk_get_raw(qp); + qp->usage[chunk] = (qp_usage_t){ .exists = true, .used = size }; qp->used_count += size; qp->bump = chunk; qp->fender = 0; - TRACE("chunk %u gen %u base %p", chunk, qp->usage[chunk].generation, - qp->base[chunk]); + if (qp->write_protect) { + TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); + } return (make_ref(chunk, 0)); } -static void -free_chunk_arrays(dns_qp_t *qp) { - TRACE("base %p usage %p max %u", qp->base, qp->usage, qp->chunk_max); - /* - * They should both be null or both non-null; if they are out of sync, - * this will intentionally trigger an assert in `isc_mem_free()`. - */ - if (qp->base != NULL || qp->usage != NULL) { - isc_mem_free(qp->mctx, qp->base); - isc_mem_free(qp->mctx, qp->usage); - } -} - /* - * This is used both to grow the arrays when they fill up, and to copy them at - * the start of an update transaction. We check if the old arrays are in use by - * readers, in which case we will do safe memory reclamation later. + * This is used to grow the chunk arrays when they fill up. If the old + * base array is in use by readers, we must make a clone, otherwise we + * can reallocate in place. 
+ * + * The isc_refcount_init() and qpbase_unref() in this function are a pair. */ static void -clone_chunk_arrays(dns_qp_t *qp, qp_chunk_t newmax) { - qp_chunk_t oldmax; - void *base, *usage; +realloc_chunk_arrays(dns_qp_t *qp, qp_chunk_t newmax) { + size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max; + size_t newptrs = sizeof(qp->base->ptr[0]) * newmax; + size_t allbytes = sizeof(dns_qpbase_t) + newptrs; + + if (qp->base == NULL || qpbase_unref(qp)) { + qp->base = isc_mem_reallocate(qp->mctx, qp->base, allbytes); + } else { + dns_qpbase_t *oldbase = qp->base; + qp->base = isc_mem_allocate(qp->mctx, allbytes); + memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs); + } + memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs); + isc_refcount_init(&qp->base->refcount, 1); + + /* usage array is exclusive to the writer */ + size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max; + size_t newusage = sizeof(qp->usage[0]) * newmax; + qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage); + memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage); - oldmax = qp->chunk_max; qp->chunk_max = newmax; - base = clone_array(qp->mctx, qp->base, oldmax, newmax, - sizeof(*qp->base)); - usage = clone_array(qp->mctx, qp->usage, oldmax, newmax, - sizeof(*qp->usage)); - - if (qp->shared_arrays) { - qp->shared_arrays = false; - } else { - free_chunk_arrays(qp); - } - qp->base = base; - qp->usage = usage; - - TRACE("base %p usage %p max %u", qp->base, qp->usage, qp->chunk_max); + TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max); } /* * There was no space in the bump chunk, so find a place to put a fresh - * chunk in the chunk table, then allocate some twigs from it. + * chunk in the chunk arrays, then allocate some twigs from it. 
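The reallocate-or-clone decision in `realloc_chunk_arrays()` can be sketched independently of the trie: when the writer holds the only reference to the base array it may reallocate in place, otherwise it must clone and leave the old array for readers to release later. A toy version under that assumption; `base_t` and `grow_base` are illustrative names:

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* illustrative stand-in for dns_qpbase_t: refcount + pointer array */
typedef struct base {
	unsigned refcount;
	void *ptr[];
} base_t;

/* grow the array to newmax slots, cloning when readers hold references */
static base_t *
grow_base(base_t *old, size_t oldmax, size_t newmax) {
	size_t bytes = sizeof(base_t) + newmax * sizeof(void *);
	base_t *nb;
	if (old != NULL && --old->refcount == 0) {
		/* writer held the only reference: realloc in place */
		nb = realloc(old, bytes);
	} else {
		/* readers still reference old: clone; old is freed later */
		nb = malloc(bytes);
		if (old != NULL) {
			memmove(nb->ptr, old->ptr, oldmax * sizeof(void *));
		}
	}
	memset(&nb->ptr[oldmax], 0, (newmax - oldmax) * sizeof(void *));
	nb->refcount = 1; /* the writer's reference */
	return (nb);
}
```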
*/ static qp_ref_t alloc_slow(dns_qp_t *qp, qp_weight_t size) { qp_chunk_t chunk; for (chunk = 0; chunk < qp->chunk_max; chunk++) { - if (qp->base[chunk] == NULL) { + if (!qp->usage[chunk].exists) { return (chunk_alloc(qp, chunk, size)); } } ENSURE(chunk == qp->chunk_max); - clone_chunk_arrays(qp, GROWTH_FACTOR(chunk)); + realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk)); return (chunk_alloc(qp, chunk, size)); } @@ -552,7 +508,7 @@ free_twigs(dns_qp_t *qp, qp_ref_t twigs, qp_weight_t size) { static void attach_twigs(dns_qp_t *qp, qp_node_t *twigs, qp_weight_t size) { for (qp_weight_t pos = 0; pos < size; pos++) { - if (!is_branch(&twigs[pos])) { + if (node_tag(&twigs[pos]) == LEAF_TAG) { attach_leaf(qp, &twigs[pos]); } } @@ -572,63 +528,62 @@ chunk_usage(dns_qp_t *qp, qp_chunk_t chunk) { } /* - * When a chunk is being recycled after a long-running read transaction, - * or after a rollback, we need to detach any leaves that remain. + * We remove each empty chunk from the total counts when the chunk is + * freed, or when it is scheduled for safe memory reclamation. We check + * the chunk's phase to avoid discounting it twice in the latter case. 
*/ static void -chunk_free(dns_qp_t *qp, qp_chunk_t chunk) { - TRACE("chunk %u gen %u base %p", chunk, qp->usage[chunk].generation, - qp->base[chunk]); - - qp_node_t *n = qp->base[chunk]; - write_protect(qp, n, false); - - for (qp_cell_t count = qp->usage[chunk].used; count > 0; count--, n++) { - if (!is_branch(n) && leaf_pval(n) != NULL) { - detach_leaf(qp, n); - } +chunk_discount(dns_qp_t *qp, qp_chunk_t chunk) { + if (qp->usage[chunk].phase == 0) { + INSIST(qp->used_count >= qp->usage[chunk].used); + INSIST(qp->free_count >= qp->usage[chunk].free); + qp->used_count -= qp->usage[chunk].used; + qp->free_count -= qp->usage[chunk].free; } - chunk_free_raw(qp, qp->base[chunk]); - - INSIST(qp->used_count >= qp->usage[chunk].used); - INSIST(qp->free_count >= qp->usage[chunk].free); - qp->used_count -= qp->usage[chunk].used; - qp->free_count -= qp->usage[chunk].free; - qp->usage[chunk].used = 0; - qp->usage[chunk].free = 0; - qp->usage[chunk].generation = 0; - qp->base[chunk] = NULL; } /* - * If we have any nodes on hold during a transaction, we must leave - * immutable chunks intact. As the last stage of safe memory reclamation, - * we can clear the hold counter and recycle all empty chunks (even from a - * nominally read-only `dns_qp_t`) because nothing refers to them any more. - * - * If we are using RCU, this can be called by `defer_rcu()` or `call_rcu()` - * to clean up after readers have left their critical sections. + * When a chunk is being recycled, we need to detach any leaves that + * remain, and free any `base` arrays that have been marked as unused. 
+ */ +static void +chunk_free(dns_qp_t *qp, qp_chunk_t chunk) { + if (qp->write_protect) { + TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); + } + + qp_node_t *n = qp->base->ptr[chunk]; + for (qp_cell_t count = qp->usage[chunk].used; count > 0; count--, n++) { + if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) { + detach_leaf(qp, n); + } else if (count > 1 && reader_valid(n)) { + dns_qpreader_t qpr; + unpack_reader(&qpr, n); + /* pairs with dns_qpmulti_commit() */ + if (qpbase_unref(&qpr)) { + isc_mem_free(qp->mctx, qpr.base); + } + } + } + chunk_discount(qp, chunk); + chunk_free_raw(qp, qp->base->ptr[chunk]); + qp->base->ptr[chunk] = NULL; + qp->usage[chunk] = (qp_usage_t){}; +} + +/* + * Free any chunks that we can while a trie is in use. */ static void recycle(dns_qp_t *qp) { - unsigned int live = 0; - unsigned int keep = 0; unsigned int free = 0; - TRACE("expect to free %u cells -> %u chunks", - (qp->free_count - qp->hold_count), - (qp->free_count - qp->hold_count) / QP_CHUNK_SIZE); - isc_nanosecs_t start = isc_time_monotonic(); for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (qp->base[chunk] == NULL) { - continue; - } else if (chunk_usage(qp, chunk) > 0 || chunk == qp->bump) { - live++; - } else if (chunk_immutable(qp, chunk) && qp->hold_count > 0) { - keep++; - } else { + if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && + qp->usage[chunk].exists && !qp->usage[chunk].immutable) + { chunk_free(qp, chunk); free++; } @@ -637,13 +592,155 @@ recycle(dns_qp_t *qp) { isc_nanosecs_t time = isc_time_monotonic() - start; atomic_fetch_add_relaxed(&recycle_time, time); - LOG_STATS("qp recycle" PRItime "live %u keep %u free %u chunks", time, - live, keep, free); + LOG_STATS("qp recycle" PRItime "free %u chunks", time, free); LOG_STATS("qp recycle after leaf %u live %u used %u free %u hold %u", qp->leaf_count, qp->used_count - qp->free_count, qp->used_count, qp->free_count, qp->hold_count); } +/* + * At the end of a transaction, mark 
empty but immutable chunks for + * reclamation later. Returns true when chunks need reclaiming later. + */ +static bool +defer_chunk_reclamation(dns_qp_t *qp, isc_qsbr_phase_t phase) { + unsigned int reclaim = 0; + + for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { + if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && + qp->usage[chunk].exists && qp->usage[chunk].immutable && + qp->usage[chunk].phase == 0) + { + chunk_discount(qp, chunk); + qp->usage[chunk].phase = phase; + reclaim++; + } + } + + LOG_STATS("qp will reclaim %u chunks in phase %u", reclaim, phase); + + return (reclaim > 0); +} + +static bool +reclaim_chunks(dns_qp_t *qp, isc_qsbr_phase_t phase) { + unsigned int free = 0; + bool more = false; + + isc_nanosecs_t start = isc_time_monotonic(); + + for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { + if (qp->usage[chunk].phase == phase) { + if (qp->usage[chunk].snapshot) { + /* cleanup when snapshot is destroyed */ + qp->usage[chunk].snapfree = true; + } else { + chunk_free(qp, chunk); + free++; + } + } else if (qp->usage[chunk].phase != 0) { + /* + * We need to reclaim more of this trie's memory + * on a later qsbr callback. + */ + more = true; + } + } + + isc_nanosecs_t time = isc_time_monotonic() - start; + recycle_time += time; + + LOG_STATS("qp reclaim" PRItime "phase %u free %u chunks", time, phase, + free); + LOG_STATS("qp reclaim after leaf %u live %u used %u free %u hold %u", + qp->leaf_count, qp->used_count - qp->free_count, + qp->used_count, qp->free_count, qp->hold_count); + + return (more); +} + +/* + * List of `dns_qpmulti_t`s that have chunks to be reclaimed. + */ +static ISC_ASTACK(dns_qpmulti_t) qsbr_work; + +/* + * When a grace period has passed, this function reclaims any unused memory. 
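The `defer_chunk_reclamation()` / `reclaim_chunks()` pair above amounts to tagging each empty immutable chunk with a QSBR phase number, then freeing it once that phase's grace period has passed. A standalone sketch of that bookkeeping, with illustrative types in place of `qp_usage_t`:

```c
#include <assert.h>
#include <stdbool.h>

#define MAXCHUNK 8

/* illustrative stand-in for qp_usage_t's reclamation fields */
typedef struct {
	bool exists;	/* chunk is allocated */
	bool immutable; /* readers may still hold pointers into it */
	unsigned used;	/* live cells; 0 means the chunk is empty */
	unsigned phase; /* nonzero: QSBR phase when it may be freed */
} usage_t;

static usage_t usage[MAXCHUNK];

/*
 * Tag empty immutable chunks with the current grace-period phase;
 * returns true when a QSBR callback needs to run later.
 */
static bool
defer_reclamation(unsigned phase) {
	bool deferred = false;
	for (unsigned c = 0; c < MAXCHUNK; c++) {
		if (usage[c].exists && usage[c].immutable &&
		    usage[c].used == 0 && usage[c].phase == 0)
		{
			usage[c].phase = phase;
			deferred = true;
		}
	}
	return (deferred);
}

/*
 * Called after the grace period: free chunks tagged with this phase;
 * returns true if chunks tagged with some other phase remain.
 */
static bool
reclaim(unsigned phase) {
	bool more = false;
	for (unsigned c = 0; c < MAXCHUNK; c++) {
		if (usage[c].phase == phase) {
			usage[c] = (usage_t){ 0 }; /* "free" the chunk */
		} else if (usage[c].phase != 0) {
			more = true;
		}
	}
	return (more);
}
```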
+ */ +static void +qp_qsbr_reclaimer(isc_qsbr_phase_t phase) { + ISC_STACK(dns_qpmulti_t) drain = ISC_ASTACK_TO_STACK(qsbr_work); + while (!ISC_STACK_EMPTY(drain)) { + /* lock before pop */ + dns_qpmulti_t *multi = ISC_STACK_TOP(drain); + INSIST(QPMULTI_VALID(multi)); + LOCK(&multi->mutex); + ISC_STACK_POP(drain, cleanup); + if (multi->writer.destroy) { + UNLOCK(&multi->mutex); + dns_qpmulti_destroy(&multi); + } else { + if (reclaim_chunks(&multi->writer, phase)) { + /* more to do next time */ + ISC_ASTACK_PUSH(qsbr_work, multi, cleanup); + } + UNLOCK(&multi->mutex); + } + } +} + +/* + * Register `qp_qsbr_reclaimer()` with QSBR at startup. + */ +static void +qp_qsbr_register(void) ISC_CONSTRUCTOR; +static void +qp_qsbr_register(void) { + isc_qsbr_register(qp_qsbr_reclaimer); +} + +/* + * When a snapshot is destroyed, clean up chunks that need free()ing + * and are not used by any remaining snapshots. + */ +static void +marksweep_chunks(dns_qpmulti_t *multi) { + unsigned int free = 0; + + isc_nanosecs_t start = isc_time_monotonic(); + + dns_qp_t *qpw = &multi->writer; + + for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL; + qps = ISC_LIST_NEXT(qps, link)) + { + for (qp_chunk_t chunk = 0; chunk < qps->chunk_max; chunk++) { + if (qps->base->ptr[chunk] != NULL) { + INSIST(qps->base->ptr[chunk] == + qpw->base->ptr[chunk]); + qpw->usage[chunk].snapmark = true; + } + } + } + + for (qp_chunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { + qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark; + qpw->usage[chunk].snapmark = false; + if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) { + chunk_free(qpw, chunk); + free++; + } + } + + isc_nanosecs_t time = isc_time_monotonic() - start; + recycle_time += time; + + LOG_STATS("qp marksweep" PRItime "free %u chunks", time, free); + LOG_STATS("qp marksweep after leaf %u live %u used %u free %u hold %u", + qpw->leaf_count, qpw->used_count - qpw->free_count, + qpw->used_count, qpw->free_count, 
qpw->hold_count); +} + /*********************************************************************** * * garbage collector @@ -675,11 +772,20 @@ evacuate(dns_qp_t *qp, qp_node_t *n) { } /* - * Immutable nodes need copy-on-write. As we walk down the trie finding - * the right place to modify, make_twigs_mutable() is called to ensure - * that shared nodes on the path from the root are copied to a mutable - * chunk. + * Immutable nodes need copy-on-write. As we walk down the trie finding the + * right place to modify, make_root_mutable() and make_twigs_mutable() + * are called to ensure that immutable nodes on the path from the root are + * copied to a mutable chunk. */ + +static inline qp_node_t * +make_root_mutable(dns_qp_t *qp) { + if (cells_immutable(qp, qp->root_ref)) { + qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp)); + } + return (ref_ptr(qp, qp->root_ref)); +} + static inline void make_twigs_mutable(dns_qp_t *qp, qp_node_t *n) { if (cells_immutable(qp, branch_twigs_ref(n))) { @@ -746,9 +852,8 @@ compact(dns_qp_t *qp) { alloc_reset(qp); } - if (is_branch(&qp->root)) { - qp->root = make_node(branch_index(&qp->root), - compact_recursive(qp, &qp->root)); + if (qp->leaf_count > 0) { + qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp)); } qp->compact_all = false; @@ -769,27 +874,6 @@ dns_qp_compact(dns_qp_t *qp, bool all) { recycle(qp); } -static void -auto_compact_recycle(dns_qp_t *qp) { - compact(qp); - recycle(qp); - /* - * This shouldn't happen if the garbage collector is - * working correctly. We can recover at the cost of some - * time and space, but recovery should be cheaper than - * letting compact+recycle fail repeatedly. 
- */ - if (QP_MAX_GARBAGE(qp)) { - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, - DNS_LOGMODULE_QP, ISC_LOG_NOTICE, - "qp %p uctx \"%s\" compact/recycle " - "failed to recover any space, " - "scheduling a full compaction", - qp, TRIENAME(qp)); - qp->compact_all = true; - } -} - /* * Free some twigs and (if they were destroyed immediately so that the * result from QP_MAX_GARBAGE can change) compact the trie if necessary. @@ -797,7 +881,7 @@ auto_compact_recycle(dns_qp_t *qp) { * This is called by the trie modification API entry points. The * free_twigs() function requires the caller to attach or detach any * leaves as necessary. Callers of squash_twigs() satisfy this - * requirement by calling cow_twigs(). + * requirement by calling make_twigs_mutable(). * * Aside: In typical garbage collectors, compaction is triggered when * the allocator runs out of space. But that is because typical garbage @@ -814,7 +898,23 @@ static inline bool squash_twigs(dns_qp_t *qp, qp_ref_t twigs, qp_weight_t size) { bool destroyed = free_twigs(qp, twigs, size); if (destroyed && QP_MAX_GARBAGE(qp)) { - auto_compact_recycle(qp); + compact(qp); + recycle(qp); + /* + * This shouldn't happen if the garbage collector is + * working correctly. We can recover at the cost of some + * time and space, but recovery should be cheaper than + * letting compact+recycle fail repeatedly. 
+ */ + if (QP_MAX_GARBAGE(qp)) { + isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_QP, ISC_LOG_NOTICE, + "qp %p uctx \"%s\" compact/recycle " + "failed to recover any space, " + "scheduling a full compaction", + qp, TRIENAME(qp)); + qp->compact_all = true; + } } return (destroyed); } @@ -848,15 +948,18 @@ dns_qp_memusage(dns_qp_t *qp) { qp->hold_count = memusage.hold; for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (qp->base[chunk] != NULL) { + if (qp->base->ptr[chunk] != NULL) { memusage.chunk_count += 1; } } - /* slight over-estimate if chunks have been shrunk */ + /* + * XXXFANF does not subtract chunks that have been shrunk, + * and does not count unreclaimed dns_qpbase_t objects + */ memusage.bytes = memusage.chunk_count * QP_CHUNK_BYTES + - qp->chunk_max * sizeof(*qp->base) + - qp->chunk_max * sizeof(*qp->usage); + qp->chunk_max * sizeof(qp->base->ptr[0]) + + qp->chunk_max * sizeof(qp->usage[0]); return (memusage); } @@ -895,34 +998,39 @@ dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p, static dns_qp_t * transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) { - dns_qp_t *qp, *old; + dns_qp_t *qp; REQUIRE(QPMULTI_VALID(multi)); REQUIRE(qptp != NULL && *qptp == NULL); LOCK(&multi->mutex); - old = multi->read; - qp = write_phase(multi); + qp = &multi->writer; - INSIST(QP_VALID(old)); - INSIST(!QP_VALID(qp)); + INSIST(QP_VALID(qp)); /* - * prepare for copy-on-write + * Mark existing chunks as immutable. + * + * Aside: The bump chunk is special: in a series of write + * transactions the bump chunk is reused; the first part (up + * to fender) is immutable, the rest mutable. But we set its + * immutable flag so that when the bump chunk fills up, the + * first part continues to be treated as immutable. (And the + * rest of the chunk too, but that's OK.) 
*/ - *qp = *old; - qp->shared_arrays = true; - qp->hold_count = qp->free_count; - - /* - * Start a new generation, and ensure it isn't zero because we - * want to avoid confusion with unset qp->usage structures. - */ - if (++qp->generation == 0) { - ++qp->generation; + for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { + if (qp->usage[chunk].exists) { + qp->usage[chunk].immutable = true; + write_protect(qp, chunk); + } } + /* + * Ensure QP_MAX_GARBAGE() ignores free space in immutable chunks. + */ + qp->hold_count = qp->free_count; + *qptp = qp; return (qp); } @@ -930,122 +1038,132 @@ transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) { /* * a write is light * - * We need to ensure we alloce from a fresh chunk if the last transaction + * We need to ensure we allocate from a fresh chunk if the last transaction * shrunk the bump chunk; but usually in a sequence of write transactions - * we just mark the point where we started this generation. + * we just put `fender` at the point where we started this generation. * - * (Instead of keeping the previous transaction's mode, I considered - * forcing allocation into the slow path by fiddling with the bump - * chunk's usage counters. But that is troublesome because - * `chunk_free_now()` needs to know how much of the chunk to scan.) + * (Aside: Instead of keeping the previous transaction's mode, I + * considered forcing allocation into the slow path by fiddling with + * the bump chunk's usage counters. But that is troublesome because + * `chunk_free()` needs to know how much of the chunk to scan.) 
*/ void dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) { dns_qp_t *qp = transaction_open(multi, qptp); TRACE(""); - if (qp->transaction_mode == QP_UPDATE) { - alloc_reset(qp); - } else { + if (qp->transaction_mode == QP_WRITE) { qp->fender = qp->usage[qp->bump].used; + } else { + alloc_reset(qp); } - qp->transaction_mode = QP_WRITE; - write_protect_all(qp); } /* - * an update is heavy + * an update is heavier * - * Make sure we have copies of all usage counters so that we can rollback. - * Do this before allocating a bump chunk so that all chunks allocated in - * this transaction are in the fresh chunk arrays. (If the existing chunk - * arrays happen to be full we might immediately clone them a second time. - * Probably not worth worrying about?) + * We always reset the allocator to the start of a fresh chunk, + * because the previous transaction was probably an update that shrunk + * the bump chunk. It simplifies rollback because `fender` is always zero. + * + * To rollback a transaction, we need to reset all the allocation + * counters to their previous state, in particular we need to un-free + * any nodes that were copied to make them mutable. This means we need + * to make a copy of basically the whole `dns_qp_t writer`: everything + * but the chunks holding the trie nodes. + * + * We do most of the transaction setup before creating the rollback + * state so that after rollback we have a correct idea of which chunks + * are immutable, and so we have the correct transaction mode to make + * the next transaction allocate a new bump chunk. The exception is + * resetting the allocator, which we do after creating the rollback + * state; if this transaction is rolled back then the next transaction + * will start from the rollback state and also reset the allocator as + * one of its first actions. 
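The rollback state described above is a plain copy of the writer's allocator bookkeeping, restored wholesale with `memmove()` if the transaction aborts. A toy version of that pattern; `writer_t` is a hypothetical stand-in for the relevant fields of `dns_qp_t`:

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* illustrative writer state: the counters rollback must restore */
typedef struct {
	unsigned used_count, free_count, bump, fender;
} writer_t;

/* an update transaction snapshots the writer before modifying it */
static writer_t *
save_rollback(const writer_t *qp) {
	writer_t *rb = malloc(sizeof(*rb));
	memmove(rb, qp, sizeof(*rb));
	return (rb);
}

/* on abort, restore the snapshot and discard it */
static void
rollback(writer_t *qp, writer_t **rbp) {
	memmove(qp, *rbp, sizeof(*qp));
	free(*rbp);
	*rbp = NULL;
}
```

In the real code the snapshot also shares the refcounted base array and clones the usage array, but the restore step is the same structure copy.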
*/ void dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) { dns_qp_t *qp = transaction_open(multi, qptp); TRACE(""); - clone_chunk_arrays(qp, qp->chunk_max); - alloc_reset(qp); - qp->transaction_mode = QP_UPDATE; - write_protect_all(qp); + + dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback)); + memmove(rollback, qp, sizeof(*rollback)); + /* can be uninitialized on the first transaction */ + if (rollback->base != NULL) { + /* paired with either _commit() or _rollback() */ + isc_refcount_increment(&rollback->base->refcount); + size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max; + rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes); + memmove(rollback->usage, qp->usage, usage_bytes); + } + INSIST(multi->rollback == NULL); + multi->rollback = rollback; + + alloc_reset(qp); } void dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) { - dns_qp_t *qp, *old; - REQUIRE(QPMULTI_VALID(multi)); - REQUIRE(qptp != NULL); - REQUIRE(*qptp == write_phase(multi)); - - old = multi->read; - qp = write_phase(multi); + REQUIRE(qptp != NULL && *qptp == &multi->writer); + REQUIRE(multi->writer.transaction_mode == QP_WRITE || + multi->writer.transaction_mode == QP_UPDATE); + dns_qp_t *qp = *qptp; TRACE(""); if (qp->transaction_mode == QP_UPDATE) { - qp_chunk_t c; - size_t bytes; - - compact(qp); - c = qp->bump; - bytes = qp->usage[c].used * sizeof(qp_node_t); - if (bytes == 0) { - chunk_free(qp, c); - } else { - qp->base[c] = chunk_shrink_raw(qp, qp->base[c], bytes); + INSIST(multi->rollback != NULL); + /* paired with dns_qpmulti_update() */ + if (qpbase_unref(multi->rollback)) { + isc_mem_free(qp->mctx, multi->rollback->base); } + if (multi->rollback->usage != NULL) { + isc_mem_free(qp->mctx, multi->rollback->usage); + } + isc_mem_free(qp->mctx, multi->rollback); + } + INSIST(multi->rollback == NULL); + + /* not the first commit? 
*/ + if (multi->reader_ref != INVALID_REF) { + INSIST(cells_immutable(qp, multi->reader_ref)); + free_twigs(qp, multi->reader_ref, READER_SIZE); } -#if HAVE_LIBURCU - rcu_assign_pointer(multi->read, qp); - /* - * XXXFANF: At this point we need to wait for a grace period (to be - * sure readers have finished) before recovering memory. This is not - * very fast, hurting write throughput. To fix it we need read - * transactions to be able to survive multiple write transactions, so - * that it matters less if we are slow to detect when readers have - * exited their critical sections. Instead of the current read / snap - * distinction, we need to allocate a read snapshot when a - * transaction commits, and clean it up (along with the unused - * chunks) in an rcu callback. - */ - synchronize_rcu(); -#else - RWLOCK(&multi->rwlock, isc_rwlocktype_write); - multi->read = qp; - RWUNLOCK(&multi->rwlock, isc_rwlocktype_write); -#endif - - /* - * Were the chunk arrays reallocated at some point? - */ - if (qp->shared_arrays) { - INSIST(old->base == qp->base); - INSIST(old->usage == qp->usage); - /* this becomes correct when `*old` is invalidated */ - qp->shared_arrays = false; + if (qp->transaction_mode == QP_WRITE) { + multi->reader_ref = alloc_twigs(qp, READER_SIZE); } else { - INSIST(old->base != qp->base); - INSIST(old->usage != qp->usage); - free_chunk_arrays(old); + /* minimize memory overhead */ + compact(qp); + multi->reader_ref = alloc_twigs(qp, READER_SIZE); + qp->base->ptr[qp->bump] = chunk_shrink_raw( + qp, qp->base->ptr[qp->bump], + qp->usage[qp->bump].used * sizeof(qp_node_t)); } - /* - * It is safe to recycle all empty chunks if they aren't being - * used by snapshots. 
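The commit itself boils down to a single release store of the packed reader node, paired with an acquire load in `reader_open()`; that pairing is what makes all of the writer's earlier stores visible to readers without any lock. A C11 sketch of the handshake, where `root_t` is a hypothetical stand-in for the reader node:

```c
#include <assert.h>
#include <stdatomic.h>

/* stands in for the trie version a reader node anchors */
typedef struct root {
	int leaf_count;
} root_t;

static _Atomic(root_t *) published;

/*
 * Writer side: everything written through r before this store is
 * guaranteed visible to any thread that acquire-loads the pointer.
 */
static void
commit(root_t *r) {
	atomic_store_explicit(&published, r, memory_order_release);
}

/* Reader side: the matching acquire load. */
static root_t *
reader_open(void) {
	return (atomic_load_explicit(&published, memory_order_acquire));
}
```

The release/acquire pair replaces both the old `rcu_assign_pointer()`/`rcu_dereference()` calls and the rwlock'd pointer swap.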
- */ - qp->hold_count = 0; - if (multi->snapshots == 0) { - recycle(qp); + /* anchor a new version of the trie */ + qp_node_t *reader = ref_ptr(qp, multi->reader_ref); + make_reader(reader, multi); + /* paired with chunk_free() */ + isc_refcount_increment(&qp->base->refcount); + + /* reader_open() below has the matching atomic_load_acquire() */ + atomic_store_release(&multi->reader, reader); /* COMMIT */ + + /* clean up what we can right now */ + recycle(qp); + + /* the reclamation phase must be sampled after the commit */ + isc_qsbr_phase_t phase = isc_qsbr_phase(multi->loopmgr); + if (defer_chunk_reclamation(qp, phase)) { + ISC_ASTACK_ADD(qsbr_work, multi, cleanup); + isc_qsbr_activate(multi->loopmgr, phase); } - *old = (dns_qp_t){}; *qptp = NULL; UNLOCK(&multi->mutex); } @@ -1058,37 +1176,51 @@ dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) { unsigned int free = 0; REQUIRE(QPMULTI_VALID(multi)); - REQUIRE(qptp != NULL); - REQUIRE(*qptp == write_phase(multi)); + REQUIRE(multi->writer.transaction_mode == QP_UPDATE); + REQUIRE(qptp != NULL && *qptp == &multi->writer); dns_qp_t *qp = *qptp; - - REQUIRE(qp->transaction_mode == QP_UPDATE); TRACE(""); isc_nanosecs_t start = isc_time_monotonic(); - /* - * recycle any chunks allocated in this transaction, - * including the bump chunk, and detach value objects - */ for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (qp->base[chunk] != NULL && !chunk_immutable(qp, chunk)) { + if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable) + { chunk_free(qp, chunk); + /* + * we need to clear its base pointer in the rollback + * trie, in case the arrays were resized + */ + if (chunk < multi->rollback->chunk_max) { + INSIST(!multi->rollback->usage[chunk].exists); + multi->rollback->base->ptr[chunk] = NULL; + } free++; } } - /* free the cloned arrays */ - INSIST(!qp->shared_arrays); - free_chunk_arrays(qp); + /* + * multi->rollback->base and multi->writer->base are the same, + * unless there was a 
realloc_chunk_arrays() during the transaction + */ + if (qpbase_unref(qp)) { + /* paired with dns_qpmulti_update() */ + isc_mem_free(qp->mctx, qp->base); + } + isc_mem_free(qp->mctx, qp->usage); + + /* reset allocator state */ + INSIST(multi->rollback != NULL); + memmove(qp, multi->rollback, sizeof(*qp)); + isc_mem_free(qp->mctx, multi->rollback); + INSIST(multi->rollback == NULL); isc_nanosecs_t time = isc_time_monotonic() - start; atomic_fetch_add_relaxed(&rollback_time, time); LOG_STATS("qp rollback" PRItime "free %u chunks", time, free); - *qp = (dns_qp_t){}; *qptp = NULL; UNLOCK(&multi->mutex); } @@ -1098,42 +1230,42 @@ dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) { * read-only transactions */ +static dns_qpmulti_t * +reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) { + dns_qpreader_t *qp = dns_qpreader(qpr); + /* dns_qpmulti_commit() has the matching atomic_store_release() */ + qp_node_t *reader = atomic_load_acquire(&multi->reader); + if (reader == NULL) { + QP_INIT(qp, multi->writer.methods, multi->writer.uctx); + } else { + multi = unpack_reader(qp, reader); + } + return (multi); +} + /* * a query is light */ void -dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t **qprp) { +dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) { REQUIRE(QPMULTI_VALID(multi)); - REQUIRE(qprp != NULL && *qprp == NULL); + REQUIRE(qp != NULL); -#if HAVE_LIBURCU - rcu_read_lock(); - *qprp = (dns_qpread_t *)rcu_dereference(multi->read); -#else - RWLOCK(&multi->rwlock, isc_rwlocktype_read); - *qprp = (dns_qpread_t *)multi->read; -#endif + dns_qpmulti_t *whence = reader_open(multi, qp); + INSIST(whence == multi); + + /* we must be in an isc_loop thread */ + qp->tid = isc_tid(); + REQUIRE(qp->tid != ISC_TID_UNKNOWN); } void -dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t **qprp) { +dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) { REQUIRE(QPMULTI_VALID(multi)); - REQUIRE(qprp != NULL && *qprp != NULL); - - /* - * when we are using 
RCU, then multi->read can change during - * our critical section, so it can be different from *qprp - */ - dns_qp_t *qp = (dns_qp_t *)*qprp; - *qprp = NULL; - REQUIRE(qp == &multi->phase[0] || qp == &multi->phase[1]); - -#if HAVE_LIBURCU - rcu_read_unlock(); -#else - RWUNLOCK(&multi->rwlock, isc_rwlocktype_read); -#endif + REQUIRE(QP_VALID(qp)); + REQUIRE(qp->tid == isc_tid()); + *qp = (dns_qpread_t){}; } /* @@ -1142,73 +1274,64 @@ dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t **qprp) { void dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { - dns_qp_t *old; - dns_qpsnap_t *qp; - size_t array_size, alloc_size; - REQUIRE(QPMULTI_VALID(multi)); REQUIRE(qpsp != NULL && *qpsp == NULL); - /* - * we need a consistent view of the chunk base array and chunk_max so - * we can't use the rwlock here (nor can we use dns_qpmulti_query) - */ LOCK(&multi->mutex); - old = multi->read; - array_size = sizeof(qp_node_t *) * old->chunk_max; - alloc_size = sizeof(dns_qpsnap_t) + array_size; - qp = isc_mem_allocate(old->mctx, alloc_size); - *qp = (dns_qpsnap_t){ - .magic = QP_MAGIC, - .root = old->root, - .methods = old->methods, - .uctx = old->uctx, - .generation = old->generation, - .base = qp->base_array, - .whence = multi, - }; - /* sometimes we take a snapshot of an empty trie */ - if (array_size > 0) { - memmove(qp->base, old->base, array_size); + dns_qp_t *qpw = &multi->writer; + size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) + + sizeof(qpw->base->ptr[0]) * qpw->chunk_max; + dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes); + qps->whence = reader_open(multi, qps); + INSIST(qps->whence == multi); + + /* not a separate allocation */ + qps->base = (dns_qpbase_t *)(qps + 1); + isc_refcount_init(&qps->base->refcount, 0); + + /* + * only copy base pointers of chunks we need, so we can + * reclaim unused memory in dns_qpsnap_destroy() + */ + qps->chunk_max = qpw->chunk_max; + for (qp_chunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { + if 
(qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) { + qpw->usage[chunk].snapshot = true; + qps->base->ptr[chunk] = qpw->base->ptr[chunk]; + } else { + qps->base->ptr[chunk] = NULL; + } } + ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link); - multi->snapshots++; - *qpsp = qp; - - TRACE("multi %p snaps %u", multi, multi->snapshots); + *qpsp = qps; UNLOCK(&multi->mutex); } void dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { - dns_qpsnap_t *qp; - REQUIRE(QPMULTI_VALID(multi)); REQUIRE(qpsp != NULL && *qpsp != NULL); - qp = *qpsp; - *qpsp = NULL; + LOCK(&multi->mutex); + + dns_qpsnap_t *qp = *qpsp; + + /* make sure the API is being used correctly */ + REQUIRE(qp->whence == multi); + + ISC_LIST_UNLINK(multi->snapshots, qp, link); /* - * `multi` and `whence` are redundant, but it helps - * to make sure the API is being used correctly + * eagerly reclaim chunks that are now unused, so that memory does + * not accumulate when a trie has a lot of updates and snapshots */ - REQUIRE(multi == qp->whence); + marksweep_chunks(multi); - LOCK(&multi->mutex); - TRACE("multi %p snaps %u gen %u", multi, multi->snapshots, - multi->read->generation); + isc_mem_free(multi->writer.mctx, qp); - isc_mem_free(multi->read->mctx, qp); - multi->snapshots--; - if (multi->snapshots == 0) { - /* - * Clean up if there were updates while we were working, - * and we are the last snapshot keeping the memory alive - */ - recycle(multi->read); - } + *qpsp = NULL; UNLOCK(&multi->mutex); } @@ -1217,23 +1340,6 @@ dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { * constructors, destructors */ -static void -initialize_guts(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, - dns_qp_t *qp) { - REQUIRE(methods != NULL); - REQUIRE(methods->attach != NULL); - REQUIRE(methods->detach != NULL); - REQUIRE(methods->makekey != NULL); - REQUIRE(methods->triename != NULL); - - *qp = (dns_qp_t){ - .magic = QP_MAGIC, - .methods = methods, - .uctx = uctx, - }; - 
isc_mem_attach(mctx, &qp->mctx); -} - void dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, dns_qp_t **qptp) { @@ -1242,14 +1348,16 @@ dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, REQUIRE(qptp != NULL && *qptp == NULL); qp = isc_mem_get(mctx, sizeof(*qp)); - initialize_guts(mctx, methods, uctx, qp); + QP_INIT(qp, methods, uctx); + isc_mem_attach(mctx, &qp->mctx); alloc_reset(qp); TRACE(""); *qptp = qp; } void -dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, +dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, + const dns_qpmethods_t *methods, void *uctx, dns_qpmulti_t **qpmp) { dns_qpmulti_t *multi; dns_qp_t *qp; @@ -1259,19 +1367,21 @@ dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, multi = isc_mem_get(mctx, sizeof(*multi)); *multi = (dns_qpmulti_t){ .magic = QPMULTI_MAGIC, - .read = &multi->phase[0], + .reader_ref = INVALID_REF, + .loopmgr = loopmgr, + .cleanup = ISC_SLINK_INITIALIZER, }; - isc_rwlock_init(&multi->rwlock); isc_mutex_init(&multi->mutex); - + ISC_LIST_INIT(multi->snapshots); /* * Do not waste effort allocating a bump chunk that will be thrown * away when a transaction is opened. 
dns_qpmulti_update() always * allocates; to ensure dns_qpmulti_write() does too, pretend the * previous transaction was an update */ - qp = multi->read; - initialize_guts(mctx, methods, uctx, qp); + qp = &multi->writer; + QP_INIT(qp, methods, uctx); + isc_mem_attach(mctx, &qp->mctx); qp->transaction_mode = QP_UPDATE; TRACE(""); *qpmp = multi; @@ -1279,21 +1389,20 @@ dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, static void destroy_guts(dns_qp_t *qp) { - if (qp->leaf_count == 1) { - detach_leaf(qp, &qp->root); - } if (qp->chunk_max == 0) { return; } for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (qp->base[chunk] != NULL) { + if (qp->base->ptr[chunk] != NULL) { chunk_free(qp, chunk); } } ENSURE(qp->used_count == 0); ENSURE(qp->free_count == 0); - ENSURE(qp->hold_count == 0); - free_chunk_arrays(qp); + ENSURE(isc_refcount_current(&qp->base->refcount) == 1); + isc_mem_free(qp->mctx, qp->base); + isc_mem_free(qp->mctx, qp->usage); + qp->magic = 0; } void @@ -1323,18 +1432,23 @@ dns_qpmulti_destroy(dns_qpmulti_t **qpmp) { REQUIRE(QPMULTI_VALID(*qpmp)); multi = *qpmp; - qp = multi->read; + qp = &multi->writer; *qpmp = NULL; REQUIRE(QP_VALID(qp)); - REQUIRE(!QP_VALID(write_phase(multi))); - REQUIRE(multi->snapshots == 0); + REQUIRE(multi->rollback == NULL); + REQUIRE(ISC_LIST_EMPTY(multi->snapshots)); - TRACE(""); - destroy_guts(qp); - isc_mutex_destroy(&multi->mutex); - isc_rwlock_destroy(&multi->rwlock); - isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); + LOCK(&multi->mutex); + if (ISC_SLINK_LINKED(multi, cleanup)) { + qp->destroy = true; + UNLOCK(&multi->mutex); + } else { + destroy_guts(qp); + UNLOCK(&multi->mutex); + isc_mutex_destroy(&multi->mutex); + isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); + } } /*********************************************************************** @@ -1364,9 +1478,12 @@ dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) { /* first leaf in an empty trie? 
*/ if (qp->leaf_count == 0) { - qp->root = new_leaf; + new_ref = alloc_twigs(qp, 1); + new_twigs = ref_ptr(qp, new_ref); + *new_twigs = new_leaf; + attach_leaf(qp, new_twigs); qp->leaf_count++; - attach_leaf(qp, &new_leaf); + qp->root_ref = new_ref; return (ISC_R_SUCCESS); } @@ -1378,7 +1495,7 @@ dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) { * we may get an out-of-bounds access if our bit is greater * than all the set bits in the node. */ - n = &qp->root; + n = ref_ptr(qp, qp->root_ref); while (is_branch(n)) { prefetch_twigs(qp, n); bit = branch_keybit(n, new_key, new_keylen); @@ -1396,7 +1513,7 @@ dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) { old_bit = qpkey_bit(old_key, old_keylen, offset); /* find where to insert a branch or grow an existing branch. */ - n = &qp->root; + n = make_root_mutable(qp); while (is_branch(n)) { prefetch_twigs(qp, n); if (offset < branch_key_offset(n)) { @@ -1480,8 +1597,12 @@ dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key, REQUIRE(QP_VALID(qp)); + if (get_root(qp) == NULL) { + return (ISC_R_NOTFOUND); + } + parent = NULL; - n = &qp->root; + n = make_root_mutable(qp); while (is_branch(n)) { prefetch_twigs(qp, n); bit = branch_keybit(n, search_key, search_keylen); @@ -1493,11 +1614,6 @@ dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key, n = branch_twig_ptr(qp, n, bit); } - /* empty trie? 
*/ - if (leaf_pval(n) == NULL) { - return (ISC_R_NOTFOUND); - } - found_keylen = leaf_qpkey(qp, n, found_key); if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) != QPKEY_EQUAL) @@ -1505,13 +1621,15 @@ dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key, return (ISC_R_NOTFOUND); } - qp->leaf_count--; detach_leaf(qp, n); + qp->leaf_count--; /* trie becomes empty */ if (qp->leaf_count == 0) { - INSIST(n == &qp->root && parent == NULL); - zero_twigs(n, 1); + INSIST(parent == NULL); + INSIST(n == get_root(qp)); + free_twigs(qp, qp->root_ref, 1); + qp->root_ref = INVALID_REF; return (ISC_R_SUCCESS); } @@ -1559,7 +1677,7 @@ dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name) { isc_result_t dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key, size_t search_keylen, void **pval_r, uint32_t *ival_r) { - dns_qpread_t *qp = dns_qpreadable_cast(qpr); + dns_qpreader_t *qp = dns_qpreader(qpr); dns_qpkey_t found_key; size_t found_keylen; qp_shift_t bit; @@ -1569,7 +1687,11 @@ dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key, REQUIRE(pval_r != NULL); REQUIRE(ival_r != NULL); - n = &qp->root; + n = get_root(qp); + if (n == NULL) { + return (ISC_R_NOTFOUND); + } + while (is_branch(n)) { prefetch_twigs(qp, n); bit = branch_keybit(n, search_key, search_keylen); @@ -1579,11 +1701,6 @@ dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key, n = branch_twig_ptr(qp, n, bit); } - /* empty trie? 
*/ - if (leaf_pval(n) == NULL) { - return (ISC_R_NOTFOUND); - } - found_keylen = leaf_qpkey(qp, n, found_key); if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) != QPKEY_EQUAL) diff --git a/lib/dns/qp_p.h b/lib/dns/qp_p.h index 0a262b7ac4..cbaefb6f88 100644 --- a/lib/dns/qp_p.h +++ b/lib/dns/qp_p.h @@ -13,6 +13,8 @@ /* * For an overview, see doc/design/qp-trie.md + * + * This private header defines the internal data structures, */ #pragma once @@ -23,12 +25,15 @@ */ /* - * A qp-trie node can be a leaf or a branch. It consists of three 32-bit - * words into which the components are packed. They are used as a 64-bit - * word and a 32-bit word, but they are not declared like that to avoid - * unwanted padding, keeping the size down to 12 bytes. They are in native - * endian order so getting the 64-bit part should compile down to an - * unaligned load. + * A qp-trie node is normally either a branch or a leaf. It consists of + * three 32-bit words into which the components are packed. They are used + * as a 64-bit word and a 32-bit word, but they are not declared like that + * to avoid unwanted padding, keeping the size down to 12 bytes. They are + * in native endian order so getting the 64-bit part should compile down to + * an unaligned load. + * + * The type of node is identified by the tag in the least significant bits + * of the 64-bit word. * * In a branch the 64-bit word is described by the enum below. The 32-bit * word is a reference to the packed sparse vector of "twigs", i.e. child @@ -37,8 +42,12 @@ * actually branch, i.e. branches cannot have only 1 child. * * The contents of each leaf are set by the trie's user. The 64-bit word - * contains a pointer value (which must be word-aligned), and the 32-bit - * word is an arbitrary integer value. + * contains a pointer value (which must be word-aligned, so the tag bits + * are zero), and the 32-bit word is an arbitrary integer value. 
+ * + * There is a third kind of node, reader nodes, which anchor the root of a + * trie. A pair of reader nodes together contain a packed `dns_qpreader_t`. + * See the section on "packed reader nodes" below. */ typedef struct qp_node { #if WORDS_BIGENDIAN @@ -49,16 +58,36 @@ typedef struct qp_node { } qp_node_t; /* - * A branch node contains a 64-bit word comprising the branch/leaf tag, - * the bitmap, and an offset into the key. It is called an "index word" - * because it describes how to access the twigs vector (think "database - * index"). The following enum sets up the bit positions of these parts. + * The possible values of the node type tag. Type tags must fit in two bits + * for compatibility with 4-byte pointer alignment on 32-bit systems. + */ +enum { + LEAF_TAG = 0, /* leaf node */ + BRANCH_TAG = 1, /* branch node */ + READER_TAG = 2, /* reader node */ + TAG_MASK = 3, /* mask covering tag bits */ +}; + +/* + * This code does not work on CPUs with large pointers, e.g. CHERI capability + * architectures. When porting to that kind of machine, a `dns_qpnode` should + * be just a `uintptr_t`; a leaf node will contain a single pointer, and a + * branch node will fit in the same space with room to spare. + */ +STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t), + "pointers must fit in 64 bits"); + +/* + * A branch node contains a 64-bit word comprising the type tag, the + * bitmap, and an offset into the key. It is called an "index word" because + * it describes how to access the twigs vector (think "database index"). + * The following enum sets up the bit positions of these parts. * * In a leaf, the same 64-bit word contains a pointer. The pointer * must be word-aligned so that the branch/leaf tag bit is zero. * This requirement is checked by the newleaf() constructor. * - * The bitmap is just above the tag bit. The `bits_for_byte[]` table is + * The bitmap is just above the type tag. 
The `bits_for_byte[]` table is * used to fill in a key so that bit tests can work directly against the * index word without superfluous masking or shifting; we don't need to * mask out the bitmap before testing a bit, but we do need to mask the @@ -70,24 +99,17 @@ typedef struct qp_node { * The names are SHIFT_thing because they are qp_shift_t values. (See * below for the various `qp_*` type declarations.) * - * These values are relatively fixed in practice; the symbolic names - * avoid mystery numbers in the code. + * These values are relatively fixed in practice: SHIFT_NOBYTE needs + * to leave space for the type tag, and the implementation of + * `dns_qpkey_fromname()` depends on the bitmap being large enough. + * The symbolic names avoid mystery numbers in the code. */ enum { - SHIFT_BRANCH = 0, /* branch / leaf tag */ - SHIFT_NOBYTE, /* label separator has no byte value */ + SHIFT_NOBYTE = 2, /* label separator has no byte value */ SHIFT_BITMAP, /* many bits here */ - SHIFT_OFFSET = 48, /* offset of byte in key */ + SHIFT_OFFSET = 49, /* offset of byte in key */ }; -/* - * Value of the node type tag bit. - * - * It is defined this way to be explicit about where the value comes - * from, even though we know it is always the bottom bit. - */ -#define BRANCH_TAG (1ULL << SHIFT_BRANCH) - /*********************************************************************** * * garbage collector tuning parameters @@ -123,7 +145,13 @@ STATIC_ASSERT(6 <= QP_CHUNK_LOG && QP_CHUNK_LOG <= 20, #define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(qp_node_t)) /* - * A chunk needs to be compacted if it has fragmented this much. + * We need a bitfield this big to count how much of a chunk is in use: + * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG`. + */ +#define QP_USAGE_BITS (QP_CHUNK_LOG + 1) + +/* + * A chunk needs to be compacted if it is less full than this threshold. 
* (12% overhead seems reasonable) */ #define QP_MAX_FREE (QP_CHUNK_SIZE / 8) @@ -221,93 +249,198 @@ ref_cell(qp_ref_t ref) { return (ref % QP_CHUNK_SIZE); } +/* + * We should not use the `root_ref` in an empty trie, so we set it + * to a value that should trigger an obvious bug. See qp_init() + * and get_root() below. + */ +#define INVALID_REF ((qp_ref_t)~0UL) + +/*********************************************************************** + * + * chunk arrays + */ + +/* + * A `dns_qp_t` contains two arrays holding information about each chunk. + * + * The `base` array holds pointers to the base of each chunk. + * The `usage` array holds the allocator's state for each chunk. + * + * The `base` array is used by the hot qp-trie traversal paths. It can + * be shared by multiple versions of a trie, which are tracked with a + * refcount. Old versions of the trie can retain old versions of the + * `base` array. + * + * In multithreaded code, the `usage` array is only used when the + * `dns_qpmulti_t` mutex is held, and there is only one version of + * it in active use (maybe with a snapshot for rollback support). + * + * The two arrays are separate because they have rather different + * access patterns, different lifetimes, and different element sizes. + */ + +/* + * For most purposes we don't need to know exactly which cells are + * in use in a chunk; we only need to know how many of them there are. + * + * After we have finished allocating from a chunk, the `used` counter + * is the size we need to know for shrinking the chunk and for + * scanning it to detach leaf values before the chunk is free()d. The + * `free` counter tells us when the chunk needs compacting and when it + * has become empty. + * + * The `exists` flag allows the chunk scanning loops to look at the + * usage array only. + * + * In multithreaded code, we mark chunks as `immutable` when a modify + * transaction is opened.
(We don't mark them immutable on commit, + * because the old bump chunk must remain mutable between write + * transactions, but it must become immutable when an update + * transaction is opened.) + * + * When a chunk becomes empty (wrt the latest version of the trie), we + * note the QSBR phase after which no old versions of the trie will + * need the chunk and it will be safe to free(). There are a few flags + * used to mark which chunks are still needed by snapshots after the + * chunks have passed their normal reclamation phase. + */ +typedef struct qp_usage { + /*% the allocation point, increases monotonically */ + qp_cell_t used : QP_USAGE_BITS; + /*% count of nodes no longer needed, also monotonic */ + qp_cell_t free : QP_USAGE_BITS; + /*% qp->base->ptr[chunk] != NULL */ + bool exists : 1; + /*% is this chunk shared? [MT] */ + bool immutable : 1; + /*% is a snapshot using this chunk? [MT] */ + bool snapshot : 1; + /*% tried to free it but a snapshot needs it [MT] */ + bool snapfree : 1; + /*% for mark/sweep snapshot flag updates [MT] */ + bool snapmark : 1; + /*% in which phase did this chunk become unused? [MT] */ + isc_qsbr_phase_t phase : ISC_QSBR_PHASE_BITS; +} qp_usage_t; + +/* + * The chunks are owned by the current version of the `base` array. + * When the array is resized, the old version might still be in use by + * concurrent readers, in which case it is free()d later when its + * refcount drops to zero. + * + * A `dns_qpbase_t` counts references from `dns_qp_t` objects and + * from packed readers, but not from `dns_qpread_t` nor from + * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t` + * would wreck multicore scalability; instead we rely on QSBR. + * + * The `usage` array determines when a chunk is no longer needed: old + * chunk pointers in old `base` arrays are ignored. (They can become + * dangling pointers to free memory, but they will never be + * dereferenced.) 
+ * + * We ensure that individual chunk base pointers remain immutable + * after assignment, and they are not cleared until the chunk is + * free()d, after all readers have departed. Slots can be reused, and + * we allow transactions to fill or re-fill empty slots adjacent to + * busy slots that are in use by readers. + */ +struct dns_qpbase { + isc_refcount_t refcount; + qp_node_t *ptr[]; +}; + +/* + * Returns true when the base array can be free()d. + */ +static inline bool +qpbase_unref(dns_qpreadable_t qpr) { + dns_qpreader_t *qp = dns_qpreader(qpr); + return (qp->base != NULL && + isc_refcount_decrement(&qp->base->refcount) == 1); +} + +/* + * Now we know about `dns_qpreader_t` and `dns_qpbase_t`, + * here's how we convert a twig reference into a pointer. + */ +static inline qp_node_t * +ref_ptr(dns_qpreadable_t qpr, qp_ref_t ref) { + dns_qpreader_t *qp = dns_qpreader(qpr); + return (qp->base->ptr[ref_chunk(ref)] + ref_cell(ref)); +} + /*********************************************************************** * * main qp-trie structures */ -#define QP_MAGIC ISC_MAGIC('t', 'r', 'i', 'e') -#define QP_VALID(qp) ISC_MAGIC_VALID(qp, QP_MAGIC) +#define QP_MAGIC ISC_MAGIC('t', 'r', 'i', 'e') +#define QPMULTI_MAGIC ISC_MAGIC('q', 'p', 'm', 'v') +#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x') + +#define QP_VALID(qp) ISC_MAGIC_VALID(qp, QP_MAGIC) +#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC) /* - * This is annoying: C doesn't allow us to use a predeclared structure as - * an anonymous struct member, so we have to fart around. The feature we - * want is available in GCC and Clang with -fms-extensions, but a - * non-standard extension won't make these declarations neater if we must - * also have a standard alternative. + * Polymorphic initialization of the `dns_qpreader_t` prefix. 
+ * + * The location of the root node is actually a qp_ref_t, but is + * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too + * many internal details into the public API. + * + * The `uctx` and `methods` support callbacks into the user's code. + * They are constant after initialization. */ +#define QP_INIT(qp, m, x) \ + (*(qp) = (typeof(*(qp))){ \ + .magic = QP_MAGIC, \ + .root_ref = INVALID_REF, \ + .uctx = x, \ + .methods = m, \ + }) /* - * Lightweight read-only access to a qp-trie. + * Snapshots have some extra cleanup machinery. * - * Just the fields neded for the hot path. The `base` field points - * to an array containing pointers to the base of each chunk like - * `qp->base[chunk]` - see `refptr()` below. + * Originally, a snapshot was basically just a `dns_qpread_t` + * allocated on the heap, with the extra behaviour that memory + * reclamation is suppressed for a particular trie while it has any + * snapshots. However that design gets into trouble for a zone with + * frequent updates and many zone transfers. * - * A `dns_qpread_t` has a lifetime that does not extend across multiple - * write transactions, so it can share a chunk `base` array belonging to - * the `dns_qpmulti_t` it came from. + * Instead, each snapshot records which chunks it needs. When a + * snapshot is created, it makes a copy of the `base` array, except + * for chunks that are empty and waiting to be reclaimed. When a + * snapshot is destroyed, we can traverse the list of snapshots to + * accurately mark which chunks are still needed. * - * We're lucky with the layout on 64 bit systems: this is only 40 bytes, - * with no padding. - */ -#define DNS_QPREAD_COMMON \ - uint32_t magic; \ - qp_node_t root; \ - qp_node_t **base; \ - void *uctx; \ - const dns_qpmethods_t *methods - -struct dns_qpread { - DNS_QPREAD_COMMON; -}; - -/* - * Heavyweight read-only snapshots of a qp-trie. 
+ * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is + * not muddled up with the wrong `dns_qpmulti_t`. * - * Unlike a lightweight `dns_qpread_t`, a snapshot can survive across - * multiple write transactions, any of which may need to expand the - * chunk `base` array. So a `dns_qpsnap_t` keeps its own copy of the - * array, which will always be equal to some prefix of the expanded - * arrays in the `dns_qpmulti_t` that it came from. - * - * The `dns_qpmulti_t` keeps a refcount of its snapshots, and while - * the refcount is non-zero, chunks are not freed or reused. When a - * `dns_qpsnap_t` is destroyed, if it decrements the refcount to zero, - * it can do any deferred cleanup. - * - * The generation number is used for tracing. + * A trie's `base` array might have grown after the snapshot was + * created, so it records its own `chunk_max`. */ struct dns_qpsnap { - DNS_QPREAD_COMMON; - uint32_t generation; + DNS_QPREADER_FIELDS; dns_qpmulti_t *whence; - qp_node_t *base_array[]; + uint32_t chunk_max; + ISC_LINK(struct dns_qpsnap) link; }; /* * Read-write access to a qp-trie requires extra fields to support the * allocator and garbage collector. * - * The chunk `base` and `usage` arrays are separate because the `usage` - * array is only needed for allocation, so it is kept separate from the - * data needed by the read-only hot path. The arrays have empty slots where - * new chunks can be placed, so `chunk_max` is the maximum number of chunks - * (until the arrays are resized). - * * Bare instances of a `struct dns_qp` are used for stand-alone - * single-threaded tries. For multithreaded access, transactions alternate - * between the `phase` pair of dns_qp objects inside a dns_qpmulti. + * single-threaded tries. For multithreaded access, a `dns_qpmulti_t` + * wraps a `dns_qp_t` with a mutex and other fields that are only needed + * at the start or end of a transaction.
* - * For multithreaded access, the `generation` counter allows us to know - * which chunks are writable or not: writable chunks were allocated in the - * current generation. For single-threaded access, the generation counter - * is always zero, so all chunks are considered to be writable. - * - * Allocations are made sequentially in the `bump` chunk. Lightweight write - * transactions can re-use the `bump` chunk, so its prefix before `fender` - * is immutable, and the rest is mutable even though its generation number - * does not match the current generation. + * Allocations are made sequentially in the `bump` chunk. A sequence + * of lightweight write transactions can use the same `bump` chunk, so + * its prefix before `fender` is immutable, and the rest is mutable. * * To decide when to compact and reclaim space, QP_MAX_GARBAGE() examines * the values of `used_count`, `free_count`, and `hold_count`. The @@ -332,39 +465,25 @@ struct dns_qpsnap { * normal compaction failed to clear the QP_MAX_GARBAGE() condition. * (This emergency is a bug even though we have a rescue mechanism.) * - * - The `shared_arrays` flag indicates that the chunk `base` and `usage` - * arrays are shared by both `phase`s in this trie's `dns_qpmulti_t`. - * This allows us to delay allocating copies of the arrays during a - * write transaction, until we definitely need to resize them. + * - When a qp-trie is destroyed while it has pending cleanup work, its + * `destroy` flag is set so that it is destroyed by the reclaim worker. + * (Because items cannot be removed from the middle of the cleanup list.) * * - When built with fuzzing support, we can use mprotect() and munmap() * to ensure that incorrect memory accesses cause fatal errors. The * `write_protect` flag must be set straight after the `dns_qpmulti_t` * is created, then left unchanged.
* - * Some of the dns_qp_t fields are only used for multithreaded transactions + * Some of the dns_qp_t fields are only needed for multithreaded transactions * (marked [MT] below) but the same code paths are also used for single- - * threaded writes. To reduce the size of a dns_qp_t, these fields could - * perhaps be moved into the dns_qpmulti_t, but that would require some kind - * of conditional runtime downcast from dns_qp_t to dns_multi_t, which is - * likely to be ugly. It is probably best to keep things simple if most tries - * need multithreaded access (XXXFANF do they? e.g. when there are many auth - * zones), + * threaded writes. */ struct dns_qp { - DNS_QPREAD_COMMON; + DNS_QPREADER_FIELDS; + /*% memory context (const) */ isc_mem_t *mctx; /*% array of per-chunk allocation counters */ - struct { - /*% the allocation point, increases monotonically */ - qp_cell_t used; - /*% count of nodes no longer needed, also monotonic */ - qp_cell_t free; - /*% when was this chunk allocated? */ - uint32_t generation; - } *usage; - /*% transaction counter [MT] */ - uint32_t generation; + qp_usage_t *usage; /*% number of slots in `chunk` and `usage` arrays */ qp_chunk_t chunk_max; /*% which chunk is used for allocations */ @@ -375,14 +494,14 @@ struct dns_qp { qp_cell_t leaf_count; /*% total of all usage[] counters */ qp_cell_t used_count, free_count; - /*% cells that cannot be recovered right now */ + /*% free cells that cannot be recovered right now */ qp_cell_t hold_count; /*% what kind of transaction was most recently started [MT] */ enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2; /*% compact the entire trie [MT] */ bool compact_all : 1; - /*% chunk arrays are shared with a readonly qp-trie [MT] */ - bool shared_arrays : 1; + /*% destroy the trie asynchronously [MT] */ + bool destroy : 1; /*% optionally when compiled with fuzzing support [MT] */ bool write_protect : 1; }; @@ -390,45 +509,60 @@ struct dns_qp { /* * Concurrent access to a qp-trie. 
* - * The `read` pointer is used for read queries. It points to one of the - * `phase` elements. During a transaction, the other `phase` (see - * `write_phase()` below) is modified incrementally in copy-on-write - * style. On commit the `read` pointer is swapped to the altered phase. + * The `reader` pointer provides wait-free access to the current version + * of the trie. See the "packed reader nodes" section below for a + * description of what it points to. + * + * We need access to the loopmgr to hook into QSBR safe memory reclamation. + * It is constant after initialization. + * + * The main object under the protection of the mutex is the `writer` + * containing all the allocator state. There can be a backup copy when + * we want to be able to rollback an update transaction. + * + * There is a `reader_ref` which corresponds to the `reader` pointer + * (`ref_ptr(multi->reader_ref) == multi->reader`). The `reader_ref` is + * necessary when freeing the space used by the reader, because there + * isn't a good way to recover a qp_ref_t from a qp_node_t pointer. + * + * There is a per-trie list of snapshots that is used for reclaiming + * memory when a snapshot is destroyed. + * + * Finally, we maintain a global list of `dns_qpmulti_t` objects that + * need asynchronous safe memory recovery. 
*/ struct dns_qpmulti { uint32_t magic; - /*% controls access to the `read` pointer and its target phase */ - isc_rwlock_t rwlock; - /*% points to phase[r] and swaps on commit */ - dns_qp_t *read; - /*% protects the snapshot counter and `write_phase()` */ + /*% safe memory reclamation context (const) */ + isc_loopmgr_t *loopmgr; + /*% pointer to current packed reader */ + atomic_ptr(qp_node_t) reader; + /*% the mutex protects the rest of this structure */ isc_mutex_t mutex; - /*% so we know when old chunks are still shared */ - unsigned int snapshots; - /*% one is read-only, one is mutable */ - dns_qp_t phase[2]; + /*% ref_ptr(writer, reader_ref) == reader */ + qp_ref_t reader_ref; + /*% the main working structure */ + dns_qp_t writer; + /*% saved allocator state to support rollback */ + dns_qp_t *rollback; + /*% all snapshots of this trie */ + ISC_LIST(dns_qpsnap_t) snapshots; + /*% safe memory reclamation work list */ + ISC_SLINK(dns_qpmulti_t) cleanup; }; -/* - * Get a pointer to the phase that isn't read-only. - */ -static inline dns_qp_t * -write_phase(dns_qpmulti_t *multi) { - bool read0 = multi->read == &multi->phase[0]; - return (read0 ? &multi->phase[1] : &multi->phase[0]); -} - -#define QPMULTI_MAGIC ISC_MAGIC('q', 'p', 'm', 'v') -#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC) - /*********************************************************************** * * interior node constructors and accessors */ /* - * See the comments under "interior node basics" above, which explain the - * layout of nodes as implemented by the following functions. + * See the comments under "interior node basics" above, which explain + * the layout of nodes as implemented by the following functions. + * + * These functions are (mostly) constructors and getters. Imagine how + * much less code there would be if C had sum types with control over + * the layout... */ /* @@ -462,7 +596,24 @@ make_node(uint64_t big, uint32_t small) { } /* - * Test a node's tag bit. 
+ * Extract a pointer from a node's 64 bit word. The double cast is to avoid + * a warning about mismatched pointer/integer sizes on 32 bit systems. + */ +static inline void * +node_pointer(qp_node_t *n) { + return ((void *)(uintptr_t)(node64(n) & ~TAG_MASK)); +} + +/* + * Examine a node's tag bits + */ +static inline uint32_t +node_tag(qp_node_t *n) { + return (n->biglo & TAG_MASK); +} + +/* + * simplified for the hot path */ static inline bool is_branch(qp_node_t *n) { @@ -472,12 +623,11 @@ is_branch(qp_node_t *n) { /* leaf nodes *********************************************************/ /* - * Get a leaf's pointer value. The double cast is to avoid a warning - * about mismatched pointer/integer sizes on 32 bit systems. + * Get a leaf's pointer value. */ static inline void * leaf_pval(qp_node_t *n) { - return ((void *)(uintptr_t)node64(n)); + return (node_pointer(n)); } /* @@ -494,7 +644,7 @@ leaf_ival(qp_node_t *n) { static inline qp_node_t make_leaf(const void *pval, uint32_t ival) { qp_node_t leaf = make_node((uintptr_t)pval, ival); - REQUIRE(!is_branch(&leaf) && pval != NULL); + REQUIRE(node_tag(&leaf) == LEAF_TAG); return (leaf); } @@ -551,15 +701,6 @@ branch_keybit(qp_node_t *n, const dns_qpkey_t key, size_t len) { return (qpkey_bit(key, len, branch_key_offset(n))); } -/* - * Convert a twig reference into a pointer. - */ -static inline qp_node_t * -ref_ptr(dns_qpreadable_t qpr, qp_ref_t ref) { - dns_qpread_t *qp = dns_qpreadable_cast(qpr); - return (qp->base[ref_chunk(ref)] + ref_cell(ref)); -} - /* * Get a pointer to a branch node's twigs vector. */ @@ -576,6 +717,33 @@ prefetch_twigs(dns_qpreadable_t qpr, qp_node_t *n) { __builtin_prefetch(branch_twigs_vector(qpr, n)); } +/* root node **********************************************************/ + +/* + * Get a pointer to the root node, checking if the trie is empty. 
+ */ +static inline qp_node_t * +get_root(dns_qpreadable_t qpr) { + dns_qpreader_t *qp = dns_qpreader(qpr); + if (qp->root_ref == INVALID_REF) { + return (NULL); + } else { + return (ref_ptr(qp, qp->root_ref)); + } +} + +/* + * When we need to move the root node, we avoid repeating allocation + * logistics by making a temporary fake branch node that has + * `branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`, + * just enough to treat the root node as a vector of one twig. + */ +#define MOVABLE_ROOT(qp) \ + (&(qp_node_t){ \ + .biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \ + .small = qp->root_ref, \ + }) + /*********************************************************************** * * bitmap popcount shenanigans */ /* * How many twigs appear in the vector before the one corresponding to the * given bit? Calculated using popcount of part of the branch's bitmap. * - * To calculate a mask that covers the lesser bits in the bitmap, we - * subtract 1 to set the bits, and subtract the branch tag because it - * is not part of the bitmap. + * To calculate a mask that covers the lesser bits in the bitmap, + * we subtract 1 to set all lesser bits, and subtract the tag mask + * because the type tag is not part of the bitmap. */ static inline qp_weight_t -branch_twigs_before(qp_node_t *n, qp_shift_t bit) { - uint64_t mask = (1ULL << bit) - 1 - BRANCH_TAG; - uint64_t bmp = branch_index(n) & mask; - return ((qp_weight_t)__builtin_popcountll(bmp)); +branch_count_bitmap_before(qp_node_t *n, qp_shift_t bit) { + uint64_t mask = (1ULL << bit) - 1 - TAG_MASK; + uint64_t bitmap = branch_index(n) & mask; + return ((qp_weight_t)__builtin_popcountll(bitmap)); } /* - * How many twigs does this node have? + * How many twigs does this branch have? * * The offset is directly after the bitmap so the offset's lesser bits * cover the whole bitmap, and the bitmap's weight is the number of twigs.
*/ static inline qp_weight_t branch_twigs_size(qp_node_t *n) { - return (branch_twigs_before(n, SHIFT_OFFSET)); + return (branch_count_bitmap_before(n, SHIFT_OFFSET)); } /* @@ -612,7 +780,7 @@ branch_twigs_size(qp_node_t *n) { */ static inline qp_weight_t branch_twig_pos(qp_node_t *n, qp_shift_t bit) { - return (branch_twigs_before(n, bit)); + return (branch_count_bitmap_before(n, bit)); } /* @@ -643,6 +811,80 @@ zero_twigs(qp_node_t *twigs, qp_weight_t size) { memset(twigs, 0, size * sizeof(qp_node_t)); } +/*********************************************************************** + * + * packed reader nodes + */ + +/* + * The purpose of these packed reader nodes is to simplify safe memory + * reclamation for a multithreaded qp-trie. + * + * After the `reader` pointer in a qpmulti is replaced, we need to wait + * for a grace period before we can reclaim the memory that is no longer + * needed by the trie. So we need some kind of structure to hold + * pointers to the (logically) detached memory until it is safe to free. + * This memory includes the chunks and the `base` arrays. + * + * Packed reader nodes save us from having to track `dns_qpread_t` + * objects as distinct allocations: the packed reader nodes get + * reclaimed when the chunk containing their cells is reclaimed. + * When a real `dns_qpread_t` object is needed, it is allocated on the + * stack (it must not live longer than an isc_loop callback) and the + * packed reader is unpacked into it. + * + * Chunks are owned by the current `base` array, so unused chunks are + * held there until they are free()d. Old `base` arrays are attached + * to packed reader nodes with a refcount. When a chunk is reclaimed, + * it is scanned so that `chunk_free()` can call `detach_leaf()` on + * any remaining references to leaf objects. Similarly, it calls + * `qpbase_unref()` to reclaim old `base` arrays.
+ */ + +/* + * Two nodes is just enough space for the information needed by + * readers and for deferred memory reclamation. + */ +#define READER_SIZE 2 + +/* + * Create a packed reader; space for the reader should have been + * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`. + */ +static inline void +make_reader(qp_node_t *reader, dns_qpmulti_t *multi) { + dns_qp_t *qp = &multi->writer; + reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC); + reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref); +} + +static inline bool +reader_valid(qp_node_t *reader) { + return (reader != NULL && // + node_tag(&reader[0]) == READER_TAG && + node_tag(&reader[1]) == READER_TAG && + node32(&reader[0]) == QPREADER_MAGIC); +} + +/* + * Verify and unpack a reader. We return the `multi` pointer to use in + * consistency checks. + */ +static inline dns_qpmulti_t * +unpack_reader(dns_qpreader_t *qp, qp_node_t *reader) { + INSIST(reader_valid(reader)); + dns_qpmulti_t *multi = node_pointer(&reader[0]); + INSIST(QPMULTI_VALID(multi)); + *qp = (dns_qpreader_t){ + .magic = QP_MAGIC, + .uctx = multi->writer.uctx, + .methods = multi->writer.methods, + .root_ref = node32(&reader[1]), + .base = node_pointer(&reader[1]), + }; + return (multi); +} + /*********************************************************************** * * method invocation helpers @@ -650,26 +892,26 @@ zero_twigs(qp_node_t *twigs, qp_weight_t size) { static inline void attach_leaf(dns_qpreadable_t qpr, qp_node_t *n) { - dns_qpread_t *qp = dns_qpreadable_cast(qpr); + dns_qpreader_t *qp = dns_qpreader(qpr); qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n)); } static inline void detach_leaf(dns_qpreadable_t qpr, qp_node_t *n) { - dns_qpread_t *qp = dns_qpreadable_cast(qpr); + dns_qpreader_t *qp = dns_qpreader(qpr); qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n)); } static inline size_t leaf_qpkey(dns_qpreadable_t qpr, qp_node_t *n, dns_qpkey_t key) { - dns_qpread_t 
*qp = dns_qpreadable_cast(qpr); + dns_qpreader_t *qp = dns_qpreader(qpr); return (qp->methods->makekey(key, qp->uctx, leaf_pval(n), leaf_ival(n))); } static inline char * triename(dns_qpreadable_t qpr, char *buf, size_t size) { - dns_qpread_t *qp = dns_qpreadable_cast(qpr); + dns_qpreader_t *qp = dns_qpreader(qpr); qp->methods->triename(qp->uctx, buf, size); return (buf); } diff --git a/tests/bench/Makefile.am b/tests/bench/Makefile.am index 5204031f32..749b7dc49c 100644 --- a/tests/bench/Makefile.am +++ b/tests/bench/Makefile.am @@ -1,11 +1,14 @@ include $(top_srcdir)/Makefile.top +AM_CFLAGS += -Wno-vla + AM_CPPFLAGS += \ $(LIBUV_CFLAGS) \ $(LIBISC_CFLAGS) \ $(LIBDNS_CFLAGS) \ -I$(top_srcdir)/fuzz \ -I$(top_srcdir)/lib/dns \ + -I$(top_srcdir)/lib/isc \ -I$(top_srcdir)/tests/include LDADD += \ diff --git a/tests/bench/load-names.c b/tests/bench/load-names.c index de0d3e2902..455f20928f 100644 --- a/tests/bench/load-names.c +++ b/tests/bench/load-names.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include diff --git a/tests/bench/qpmulti.c b/tests/bench/qpmulti.c index e86797a061..ed072804a9 100644 --- a/tests/bench/qpmulti.c +++ b/tests/bench/qpmulti.c @@ -11,43 +11,81 @@ * information regarding copyright ownership. */ +#include #include #include #include #include +#include +#include #include +#include +#include #include #include +#include +#include #include #include #include #include +#include #include +#include #include +#include #include #include #include +#include "loop_p.h" #include "qp_p.h" #include -#define ITEM_COUNT ((size_t)1000000) +#define ITEM_COUNT ((size_t)1000000) +#define RUNTIME (0.25 * NS_PER_SEC) +#define MAX_OPS_PER_LOOP (1 << 10) -#define MS_PER_SEC 1000 -#define US_PER_SEC 1000000 -#define NS_PER_SEC 1000000000 +#define VERBOSE 0 +#define ZIPF 0 -static double -doubletime(isc_time_t t0, isc_time_t t1) { - return ((double)isc_time_microdiff(&t1, &t0) / (double)US_PER_SEC); +#if VERBOSE +#define TRACE(fmt, ...) 
\ + isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \ + ISC_LOG_DEBUG(7), "%s:%d:%s():t%d: " fmt, __FILE__, \ + __LINE__, __func__, isc_tid(), ##__VA_ARGS__) +#else +#define TRACE(...) +#endif + +#if ZIPF +/* + * Zipf rejection sampling derived from code by Jason Crease + * https://jasoncrease.medium.com/rejection-sampling-the-zipf-distribution-6b359792cffa + */ +static uint32_t +rand_zipf(uint32_t max, double skew) { + double s = skew; + double t = (pow(max, 1 - s) - s) / (1 - s); + for (;;) { + double p = t * (double)isc_random32() / UINT32_MAX; + double invB = p <= 1 ? p : pow(p * (1 - s) + s, 1 / (1 - s)); + uint32_t sample = (uint32_t)(invB + 1); + double ratio = sample <= 1 ? pow(sample, -s) + : pow(sample, -s) / pow(invB, -s); + if (ratio > (double)isc_random32() / UINT32_MAX) { + return sample - 1; + } + } } +#endif static struct { + size_t len; bool present; - uint8_t len; dns_qpkey_t key; } *item; @@ -86,7 +124,6 @@ random_byte(void) { static void init_items(isc_mem_t *mctx) { - isc_time_t t0, t1; void *pval = NULL; uint32_t ival = ~0U; dns_qp_t *qp = NULL; @@ -95,7 +132,7 @@ init_items(isc_mem_t *mctx) { item = isc_mem_allocatex(mctx, bytes, ISC_MEM_ZERO); - isc_time_now_hires(&t0); + uint64_t start = isc_time_monotonic(); /* ensure there are no duplicate names */ dns_qp_create(mctx, &item_methods, NULL, &qp); @@ -113,39 +150,9 @@ init_items(isc_mem_t *mctx) { } dns_qp_destroy(&qp); - isc_time_now_hires(&t1); - double time = doubletime(t0, t1); - printf("%f sec to create %zu items, %f/sec\n", time, ITEM_COUNT, - ITEM_COUNT / time); -} - -static void -init_multi(isc_mem_t *mctx, dns_qpmulti_t **qpmp, uint32_t max) { - isc_time_t t0, t1; - dns_qpmulti_t *multi = NULL; - dns_qp_t *qp = NULL; - size_t count = 0; - - isc_time_now_hires(&t0); - - dns_qpmulti_create(mctx, &item_methods, NULL, qpmp); - multi = *qpmp; - - /* initial contents of the trie */ - dns_qpmulti_update(multi, &qp); - for (size_t i = 0; i < max; i++) { - if 
(isc_random_uniform(2) == 0) { - continue; - } - INSIST(dns_qp_insert(qp, &item[i], i) == ISC_R_SUCCESS); - item[i].present = true; - count++; - } - dns_qpmulti_commit(multi, &qp); - - isc_time_now_hires(&t1); - double time = doubletime(t0, t1); - printf("%f sec to load %zu items, %f/sec\n", time, count, count / time); + double time = (double)(isc_time_monotonic() - start) / NS_PER_SEC; + printf("%f sec to create %zu items, %f/sec %zu bytes\n", time, + ITEM_COUNT, ITEM_COUNT / time, bytes); } static void @@ -168,207 +175,736 @@ init_logging(isc_mem_t *mctx) { ISC_LOG_DYNAMIC, &destination, ISC_LOG_PRINTPREFIX | ISC_LOG_PRINTTIME | ISC_LOG_ISO8601); - - // isc_log_setdebuglevel(lctx, 1); +#if VERBOSE + isc_log_setdebuglevel(lctx, 7); +#endif result = isc_log_usechannel(logconfig, "stderr", ISC_LOGCATEGORY_DEFAULT, NULL); INSIST(result == ISC_R_SUCCESS); } -typedef void -transaction_fun(dns_qpmulti_t *multi, uint32_t max, uint32_t ops, - uint64_t *absent_r, uint64_t *present_r); - -static transaction_fun read_transaction, update_transaction; +static void +collect(void *); struct thread_args { - transaction_fun *txfun; /* (in) */ - dns_qpmulti_t *multi; /* (in) */ - isc_thread_t tid; /* (in) */ - uint32_t max; /* item index (in) */ - uint32_t ops; /* per transaction (in) */ - uint64_t absent; /* items not found or inserted (out) */ - uint64_t present; /* items found or deleted (out) */ - uint64_t transactions; /* (out) */ - isc_time_t t0; /* (out) */ - isc_time_t t1; /* (out) */ + struct bench_state *bctx; /* (in) */ + isc_barrier_t *barrier; /* (in) */ + isc_loopmgr_t *loopmgr; /* (in) */ + uv_idle_t handle; /* (in) */ + uv_idle_cb cb; /* (in) */ + dns_qpmulti_t *multi; /* (in) */ + double zipf_skew; /* (in) */ + uint32_t max_item; /* (in) */ + uint32_t ops_per_tx; /* (in) */ + uint32_t tx_per_loop; /* (in) */ + uint32_t absent; /* items not found or inserted (out) */ + uint32_t present; /* items found or deleted (out) */ + uint32_t compactions; /* (out) */ + 
uint64_t transactions; /* (out) */ + isc_nanosecs_t worked; /* (out) */ + isc_nanosecs_t start; /* (out) */ + isc_nanosecs_t stop; /* (out) */ }; static void -read_transaction(dns_qpmulti_t *multi, uint32_t max, uint32_t ops, - uint64_t *absent_r, uint64_t *present_r) { - dns_qpread_t *qp = NULL; - uint64_t absent = 0; - uint64_t present = 0; +first_loop(void *varg) { + struct thread_args *args = varg; + isc_loop_t *loop = isc_loop_current(args->loopmgr); + + uv_idle_init(&loop->loop, &args->handle); + uv_idle_start(&args->handle, args->cb); + args->handle.data = args; + + isc_barrier_wait(args->barrier); + args->start = isc_time_monotonic(); +} + +static void +next_loop(struct thread_args *args, isc_nanosecs_t start) { + isc_nanosecs_t stop = isc_time_monotonic(); + args->worked += stop - start; + args->stop = stop; + if (args->stop - args->start < RUNTIME) { + return; + } + uv_idle_stop(&args->handle); + uv_close(&args->handle, NULL); + isc_async_run(isc_loop_main(args->loopmgr), collect, args); +} + +#if ZIPF +static void +read_zipf(uv_idle_t *idle) { + struct thread_args *args = idle->data; + + /* outside time because it is v slow */ + uint32_t r[args->tx_per_loop][args->ops_per_tx]; + for (uint32_t tx = 0; tx < args->tx_per_loop; tx++) { + for (uint32_t op = 0; op < args->ops_per_tx; op++) { + r[tx][op] = rand_zipf(args->max_item, args->zipf_skew); + } + } + + isc_nanosecs_t start = isc_time_monotonic(); void *pval; uint32_t ival; - isc_result_t result; - dns_qpmulti_query(multi, &qp); - for (uint32_t n = 0; n < ops; n++) { - uint32_t i = isc_random_uniform(max); - result = dns_qp_getkey(qp, item[i].key, item[i].len, &pval, - &ival); - if (result == ISC_R_SUCCESS) { - ++present; - } else { - ++absent; + for (uint32_t tx = 0; tx < args->tx_per_loop; tx++) { + args->transactions++; + dns_qpread_t qp; + dns_qpmulti_query(args->multi, &qp); + for (uint32_t op = 0; op < args->ops_per_tx; op++) { + uint32_t i = r[tx][op]; + isc_result_t result = dns_qp_getkey( + 
&qp, item[i].key, item[i].len, &pval, &ival); + if (result == ISC_R_SUCCESS) { + args->present++; + } else { + args->absent++; + } } + dns_qpread_destroy(args->multi, &qp); } - dns_qpread_destroy(multi, &qp); - *present_r = present; - *absent_r = absent; + next_loop(args, start); +} +#else +#define read_zipf read_transactions +#endif + +static void +read_transactions(uv_idle_t *idle) { + struct thread_args *args = idle->data; + isc_nanosecs_t start = isc_time_monotonic(); + void *pval; + uint32_t ival; + + for (uint32_t tx = 0; tx < args->tx_per_loop; tx++) { + args->transactions++; + dns_qpread_t qp; + dns_qpmulti_query(args->multi, &qp); + for (uint32_t op = 0; op < args->ops_per_tx; op++) { + uint32_t i = isc_random_uniform(args->max_item); + isc_result_t result = dns_qp_getkey( + &qp, item[i].key, item[i].len, &pval, &ival); + if (result == ISC_R_SUCCESS) { + args->present++; + } else { + args->absent++; + } + } + dns_qpread_destroy(args->multi, &qp); + } + next_loop(args, start); } static void -update_transaction(dns_qpmulti_t *multi, uint32_t max, uint32_t ops, - uint64_t *absent_r, uint64_t *present_r) { +mutate_transactions(uv_idle_t *idle) { + struct thread_args *args = idle->data; + isc_nanosecs_t start = isc_time_monotonic(); + + for (uint32_t tx = 0; tx < args->tx_per_loop; tx++) { + dns_qp_t *qp = NULL; + dns_qpmulti_write(args->multi, &qp); + for (uint32_t op = 0; op < args->ops_per_tx; op++) { + uint32_t i = isc_random_uniform(args->max_item); + if (item[i].present) { + isc_result_t result = dns_qp_deletekey( + qp, item[i].key, item[i].len); + INSIST(result == ISC_R_SUCCESS); + item[i].present = false; + args->present++; + } else { + isc_result_t result = + dns_qp_insert(qp, &item[i], i); + INSIST(result == ISC_R_SUCCESS); + item[i].present = true; + args->absent++; + } + } + if (dns_qp_memusage(qp).fragmented) { + dns_qp_compact(qp, false); + args->compactions++; + } + dns_qpmulti_commit(args->multi, &qp); + args->transactions++; + } + 
next_loop(args, start); +} + +enum benchmode { + init, + vary_max_items_rw, + vary_max_items_ro, + vary_mut_read, + vary_read_only, + vary_mut_ops_per_tx, + vary_mut_tx_per_loop, + vary_read_ops_per_tx_rw, + vary_read_ops_per_tx_ro, + vary_read_tx_per_loop_rw, + vary_read_tx_per_loop_ro, + vary_zipf_skew, +}; + +struct bench_state { + isc_mem_t *mctx; + isc_barrier_t barrier; + isc_loopmgr_t *loopmgr; + dns_qpmulti_t *multi; + enum benchmode mode; + size_t bytes; + size_t qp_bytes; + size_t qp_items; + isc_nanosecs_t load_time; + uint32_t nloops; + uint32_t waiting; + uint32_t max_item; + uint32_t mutate; + uint32_t mut_ops_per_tx; + uint32_t mut_tx_per_loop; + uint32_t readers; + uint32_t read_ops_per_tx; + uint32_t read_tx_per_loop; + double zipf_skew; + struct thread_args thread[]; +}; + +static void +load_multi(struct bench_state *bctx) { dns_qp_t *qp = NULL; - uint64_t absent = 0; - uint64_t present = 0; - isc_result_t result; + size_t count = 0; - if (multi->read->generation & 255) { - dns_qpmulti_write(multi, &qp); - } else { - dns_qpmulti_update(multi, &qp); - } - for (uint32_t n = 0; n < ops; n++) { - uint32_t i = isc_random_uniform(max); - if (item[i].present) { - result = dns_qp_deletekey(qp, item[i].key, item[i].len); - INSIST(result == ISC_R_SUCCESS); + uint64_t start = isc_time_monotonic(); + + dns_qpmulti_create(bctx->mctx, bctx->loopmgr, &item_methods, NULL, + &bctx->multi); + + /* initial contents of the trie */ + dns_qpmulti_update(bctx->multi, &qp); + for (size_t i = 0; i < bctx->max_item; i++) { + if (isc_random_uniform(2) == 0) { item[i].present = false; - ++present; - } else { - result = dns_qp_insert(qp, &item[i], i); - INSIST(result == ISC_R_SUCCESS); - item[i].present = true; - ++absent; + continue; } + INSIST(dns_qp_insert(qp, &item[i], i) == ISC_R_SUCCESS); + item[i].present = true; + count++; } - dns_qpmulti_commit(multi, &qp); - *present_r += present; - *absent_r += absent; -} + dns_qp_compact(qp, true); + 
dns_qpmulti_commit(bctx->multi, &qp); -static isc_refcount_t stop; - -static void * -thread_loop(void *args_v) { - struct thread_args *args = args_v; - transaction_fun *txfun = args->txfun; - dns_qpmulti_t *multi = args->multi; - uint32_t max = args->max; - uint32_t ops = args->ops; - uint64_t absent = 0; - uint64_t present = 0; - uint64_t transactions = 0; - -#if HAVE_LIBURCU - rcu_register_thread(); -#endif - isc_time_now_hires(&args->t0); - while (isc_refcount_current(&stop) == 0) { - txfun(multi, max, ops, &absent, &present); - ++transactions; - } - isc_time_now_hires(&args->t1); - args->absent = absent; - args->present = present; - args->transactions = transactions; -#if HAVE_LIBURCU - rcu_unregister_thread(); -#endif - return (args); + bctx->load_time = isc_time_monotonic() - start; + bctx->qp_bytes = dns_qpmulti_memusage(bctx->multi).bytes; + bctx->qp_items = count; } static void -dispatch_threads(dns_qpmulti_t *multi, useconds_t runtime, uint32_t max, - uint32_t updaters, uint32_t updateops, uint32_t readers, - uint32_t readops) { - struct thread_args thread[64]; - uint32_t threads = updaters + readers; +tsv_header(void) { + printf("runtime\t"); + printf("elapsed\t"); + printf(" load s\t"); + printf(" B/item\t"); + printf(" items\t"); - REQUIRE(threads <= ARRAY_SIZE(thread)); + printf(" mut\t"); + printf("tx/loop\t"); + printf(" ops/tx\t"); + printf(" gc\t"); + printf(" txns\t"); + printf(" ops\t"); + printf(" work s\t"); + printf("txns/us\t"); + printf(" ops/us\t"); - for (uint32_t t = 0; t < threads; t++) { - thread[t] = (struct thread_args){ - .txfun = t < updaters ? update_transaction - : read_transaction, - .multi = multi, - .max = max, - .ops = t < updaters ? 
updateops : readops, + printf(" read\t"); + printf("tx/loop\t"); + printf(" ops/tx\t"); + printf(" Ktxns\t"); + printf(" Kops\t"); + printf(" work s\t"); + printf("txns/us\t"); + printf(" ops/us\t"); + printf(" raw\t"); + printf(" loop\n"); +} + +/* + * This function sets up the parameters for each benchmark run and + * dispatches the work to the event loops. Each run is part of a + * series, where most of the parameters are fixed and one parameter is + * varied. The layout here is somewhat eccentric, in order to keep + * each series together. + * + * A series starts with an `init` block, which sets up the constant + * parameters and the variable parameter for the first run. Following + * the `init` block is a `case` label which adjusts the variable + * parameter for each subsequent run in the series, and checks when + * the series is finished. At the end of the series, we `goto` the + * `init` label for the next series. + */ +static void +dispatch(struct bench_state *bctx) { + switch (bctx->mode) { + case init: + goto init_max_items_rw; + + fini:; + isc_loopmgr_t *loopmgr = bctx->loopmgr; + dns_qpmulti_destroy(&bctx->multi); + isc_mem_putanddetach(&bctx->mctx, bctx, bctx->bytes); + isc_loopmgr_shutdown(loopmgr); + return; + + init_max_items_rw: + bctx->mode = vary_max_items_rw; + printf("\n"); + printf("vary size of trie\n"); + tsv_header(); + bctx->mutate = 1; + bctx->readers = bctx->nloops - 1; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 32; + bctx->read_tx_per_loop = 32; + bctx->max_item = 10; + load_multi(bctx); + break; + case vary_max_items_rw: + if (bctx->max_item == ITEM_COUNT) { + goto init_max_items_ro; + } else { + dns_qpmulti_destroy(&bctx->multi); + bctx->max_item *= 10; + load_multi(bctx); + } + break; + + init_max_items_ro: + bctx->mode = vary_max_items_ro; + printf("\n"); + printf("vary size of trie (readonly)\n"); + tsv_header(); + bctx->mutate = 0; + bctx->readers = bctx->nloops; + bctx->mut_ops_per_tx = 4; 
+ bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 32; + bctx->read_tx_per_loop = 32; + dns_qpmulti_destroy(&bctx->multi); + bctx->max_item = 10; + load_multi(bctx); + break; + case vary_max_items_ro: + if (bctx->max_item == ITEM_COUNT) { + goto init_zipf_skew; + } else { + dns_qpmulti_destroy(&bctx->multi); + bctx->max_item *= 10; + load_multi(bctx); + } + break; + + init_zipf_skew: + bctx->mode = vary_zipf_skew; + printf("\n"); + printf("vary zipf skew (readonly) " + " [ cache friendliness? ]\n"); + tsv_header(); + bctx->mutate = 0; + bctx->readers = 0; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 32; + bctx->read_tx_per_loop = 32; + bctx->zipf_skew = 0.01; + /* dumb hack */ + bctx->load_time = bctx->zipf_skew * NS_PER_SEC; + break; + case vary_zipf_skew: + bctx->zipf_skew += 0.1; + bctx->load_time = bctx->zipf_skew * NS_PER_SEC; + if (bctx->zipf_skew >= 1.0) { + bctx->zipf_skew = 0.0; + bctx->load_time = 0; + goto init_mut_read; + } + break; + + init_mut_read: + bctx->mode = vary_mut_read; + printf("\n"); + printf("vary mutate / read threads " + "[ read perf per thread should be flat ]\n"); + tsv_header(); + bctx->mutate = bctx->nloops - 1; + bctx->readers = 1; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 32; + bctx->read_tx_per_loop = 32; + break; + case vary_mut_read: + if (bctx->mutate == 1) { + goto init_read_only; + } else { + bctx->mutate--; + bctx->readers++; + } + break; + + init_read_only: + bctx->mode = vary_read_only; + printf("\n"); + printf("vary read threads " + "[ read perf per thread should be flat ]\n"); + tsv_header(); + bctx->mutate = 0; + bctx->readers = 1; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 32; + bctx->read_tx_per_loop = 32; + break; + case vary_read_only: + if (bctx->readers == bctx->nloops) { + goto init_mut_ops_per_tx; + } else { + bctx->readers++; + } + break; + + init_mut_ops_per_tx: + bctx->mode = 
vary_mut_ops_per_tx; + printf("\n"); + printf("vary mutate operations per transaction " + "[ mutate activity affects read perf? ]\n"); + tsv_header(); + bctx->mutate = 1; + bctx->readers = bctx->nloops - 1; + bctx->mut_ops_per_tx = 1; + bctx->mut_tx_per_loop = 1; + bctx->read_ops_per_tx = 32; + bctx->read_tx_per_loop = 32; + break; + case vary_mut_ops_per_tx: + if (bctx->mut_ops_per_tx * bctx->mut_tx_per_loop == + MAX_OPS_PER_LOOP) + { + goto init_mut_tx_per_loop; + } else { + bctx->mut_ops_per_tx *= 2; + } + break; + + init_mut_tx_per_loop: + bctx->mode = vary_mut_tx_per_loop; + printf("\n"); + printf("vary mutate transactions per loop " + "[ mutate activity affects read perf? ]\n"); + tsv_header(); + bctx->mutate = 1; + bctx->readers = bctx->nloops - 1; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 1; + bctx->read_ops_per_tx = 32; + bctx->read_tx_per_loop = 32; + break; + case vary_mut_tx_per_loop: + if (bctx->mut_ops_per_tx * bctx->mut_tx_per_loop == + MAX_OPS_PER_LOOP) + { + goto init_read_tx_per_loop_rw; + } else { + bctx->mut_tx_per_loop *= 2; + } + break; + + init_read_tx_per_loop_rw: + bctx->mode = vary_read_tx_per_loop_rw; + printf("\n"); + printf("vary read transactions per loop " + "[ loop overhead? ]\n"); + tsv_header(); + bctx->mutate = 1; + bctx->readers = bctx->nloops - 1; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 4; + bctx->read_tx_per_loop = 1; + break; + case vary_read_tx_per_loop_rw: + if (bctx->read_ops_per_tx * bctx->read_tx_per_loop == + MAX_OPS_PER_LOOP) + { + goto init_read_tx_per_loop_ro; + } else { + bctx->read_tx_per_loop *= 2; + } + break; + + init_read_tx_per_loop_ro: + bctx->mode = vary_read_tx_per_loop_ro; + printf("\n"); + printf("vary read transactions per loop (readonly) " + "[ loop overhead? 
]\n"); + tsv_header(); + bctx->mutate = 0; + bctx->readers = bctx->nloops; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 4; + bctx->read_tx_per_loop = 1; + break; + case vary_read_tx_per_loop_ro: + if (bctx->read_ops_per_tx * bctx->read_tx_per_loop == + MAX_OPS_PER_LOOP) + { + goto init_read_ops_per_tx_rw; + } else { + bctx->read_tx_per_loop *= 2; + } + break; + + init_read_ops_per_tx_rw: + bctx->mode = vary_read_ops_per_tx_rw; + printf("\n"); + printf("vary read operations per transaction " + " [ transaction overhead should be small ]\n"); + tsv_header(); + bctx->mutate = 1; + bctx->readers = bctx->nloops - 1; + bctx->mut_ops_per_tx = 4; + bctx->mut_tx_per_loop = 4; + bctx->read_ops_per_tx = 1; + bctx->read_tx_per_loop = MAX_OPS_PER_LOOP; + break; + case vary_read_ops_per_tx_rw: + if (bctx->read_ops_per_tx == MAX_OPS_PER_LOOP) { + goto init_read_ops_per_tx_ro; + } else { + bctx->read_ops_per_tx *= 2; + bctx->read_tx_per_loop /= 2; + } + break; + + init_read_ops_per_tx_ro: + bctx->mode = vary_read_ops_per_tx_ro; + printf("\n"); + printf("vary read operations per transaction (readonly) " + " [ transaction overhead should be small ]\n"); + tsv_header(); + bctx->mutate = 0; + bctx->readers = bctx->nloops; + bctx->mut_ops_per_tx = 0; + bctx->mut_tx_per_loop = 0; + bctx->read_ops_per_tx = 1; + bctx->read_tx_per_loop = MAX_OPS_PER_LOOP; + break; + case vary_read_ops_per_tx_ro: + if (bctx->read_ops_per_tx == MAX_OPS_PER_LOOP) { + goto fini; + } else { + bctx->read_ops_per_tx *= 2; + bctx->read_tx_per_loop /= 2; + } + break; + } + + /* dispatch a benchmark run */ + + bool zipf = bctx->mutate == 0 && bctx->readers == 0; + bctx->waiting = zipf ? 
bctx->nloops : bctx->readers + bctx->mutate; + isc_barrier_init(&bctx->barrier, bctx->waiting); + for (uint32_t t = 0; t < bctx->waiting; t++) { + bool mut = t < bctx->mutate; + bctx->thread[t] = (struct thread_args){ + .bctx = bctx, + .barrier = &bctx->barrier, + .loopmgr = bctx->loopmgr, + .multi = bctx->multi, + .max_item = bctx->max_item, + .zipf_skew = bctx->zipf_skew, + .cb = zipf ? read_zipf + : mut ? mutate_transactions + : read_transactions, + .ops_per_tx = mut ? bctx->mut_ops_per_tx + : bctx->read_ops_per_tx, + .tx_per_loop = mut ? bctx->mut_tx_per_loop + : bctx->read_tx_per_loop, }; + isc_async_run(isc_loop_get(bctx->loopmgr, t), first_loop, + &bctx->thread[t]); } +} - isc_refcount_init(&stop, 0); +static void +collect(void *varg) { + TRACE(""); - for (uint32_t t = 0; t < threads; t++) { - isc_thread_create(thread_loop, &thread[t], &thread[t].tid); - } - - usleep(runtime); - isc_refcount_increment0(&stop); - - for (uint32_t t = 0; t < threads; t++) { - isc_thread_join(thread[t].tid, NULL); + struct thread_args *args = varg; + struct bench_state *bctx = args->bctx; + struct thread_args *thread = bctx->thread; + + bctx->waiting--; + if (bctx->waiting > 0) { + return; } + isc_barrier_destroy(&bctx->barrier); struct { - double time, txns, ops; + uint64_t worked, txns, ops, compactions; } stats[2] = {}; - for (uint32_t t = 0; t < threads; t++) { + double load_time = bctx->load_time; + load_time = load_time > 0 ? load_time / (double)NS_PER_SEC : NAN; + + double elapsed = 0; + bool zipf = bctx->mutate == 0 && bctx->readers == 0; + uint32_t nloops = zipf ? 
bctx->nloops : bctx->readers + bctx->mutate; + for (uint32_t t = 0; t < nloops; t++) { struct thread_args *tp = &thread[t]; - stats[t < updaters].time += doubletime(tp->t0, tp->t1); - stats[t < updaters].txns += tp->transactions; - stats[t < updaters].ops += tp->transactions * tp->ops; + elapsed = ISC_MAX(elapsed, (tp->stop - tp->start)); + bool mut = t < bctx->mutate; + stats[mut].worked += tp->worked; + stats[mut].txns += tp->transactions; + stats[mut].ops += tp->transactions * tp->ops_per_tx; + stats[mut].compactions += tp->compactions; + } + + printf("%7.3f\t", RUNTIME / (double)NS_PER_SEC); + printf("%7.3f\t", elapsed / (double)NS_PER_SEC); + printf("%7.3f\t", load_time); + printf("%7.2f\t", (double)bctx->qp_bytes / bctx->qp_items); + printf("%7u\t", bctx->max_item); + + double mut_work = stats[1].worked / (double)US_PER_MS; + printf("%7u\t", bctx->mutate); + printf("%7u\t", bctx->mut_tx_per_loop); + printf("%7u\t", bctx->mut_ops_per_tx); + printf("%7llu\t", (unsigned long long)stats[1].compactions); + printf("%7llu\t", (unsigned long long)stats[1].txns); + printf("%7llu\t", (unsigned long long)stats[1].ops); + printf("%7.2f\t", stats[1].worked / (double)NS_PER_SEC); + printf("%7.2f\t", stats[1].txns / mut_work); + printf("%7.2f\t", stats[1].ops / mut_work); + + double readers = zipf ? 
bctx->nloops - bctx->mutate : bctx->readers; + double read_work = stats[0].worked / (double)US_PER_MS; + double elapsed_ms = elapsed / (double)US_PER_MS; + printf("%7u\t", bctx->readers); + printf("%7u\t", bctx->read_tx_per_loop); + printf("%7u\t", bctx->read_ops_per_tx); + printf("%7llu\t", (unsigned long long)stats[0].txns / 1000); + printf("%7llu\t", (unsigned long long)stats[0].ops / 1000); + printf("%7.2f\t", stats[0].worked / (double)NS_PER_SEC); + printf("%7.2f\t", stats[0].txns / read_work); + printf("%7.2f\t", stats[0].ops / read_work); + printf("%7.2f\t", stats[0].ops * readers / read_work); + printf("%7.2f\n", stats[0].ops / elapsed_ms); + + dispatch(bctx); +} + +static void +startup(void *arg) { + isc_loopmgr_t *loopmgr = arg; + isc_loop_t *loop = isc_loop_current(loopmgr); + isc_mem_t *mctx = isc_loop_getmctx(loop); + uint32_t nloops = isc_loopmgr_nloops(loopmgr); + + size_t bytes = sizeof(struct bench_state) + + sizeof(struct thread_args) * nloops; + + struct bench_state *bctx = isc_mem_getx(mctx, bytes, ISC_MEM_ZERO); + + *bctx = (struct bench_state){ + .loopmgr = loopmgr, + .bytes = bytes, + .nloops = nloops, + }; + isc_mem_attach(mctx, &bctx->mctx); + + dispatch(bctx); +} + +struct ticker { + isc_loopmgr_t *loopmgr; + isc_mem_t *mctx; + isc_timer_t *timer; +}; + +static void +tick(void *varg) { + /* just make the loop cycle */ + UNUSED(varg); +} + +static void +start_ticker(void *varg) { + struct ticker *ticker = varg; + isc_loop_t *loop = isc_loop_current(ticker->loopmgr); + + isc_timer_create(loop, tick, NULL, &ticker->timer); + isc_timer_start(ticker->timer, isc_timertype_ticker, + &(isc_interval_t){ + .seconds = 0, + .nanoseconds = 1 * NS_PER_MS, + }); +} + +static void +stop_ticker(void *varg) { + struct ticker *ticker = varg; + + isc_timer_stop(ticker->timer); + isc_timer_destroy(&ticker->timer); + isc_mem_putanddetach(&ticker->mctx, ticker, sizeof(*ticker)); +} + +static void +setup_tickers(isc_mem_t *mctx, isc_loopmgr_t *loopmgr) { + 
uint32_t nloops = isc_loopmgr_nloops(loopmgr); + for (uint32_t i = 0; i < nloops; i++) { + isc_loop_t *loop = isc_loop_get(loopmgr, i); + struct ticker *ticker = isc_mem_getx(mctx, sizeof(*ticker), + ISC_MEM_ZERO); + isc_mem_attach(mctx, &ticker->mctx); + ticker->loopmgr = loopmgr; + isc_loop_setup(loop, start_ticker, ticker); + isc_loop_teardown(loop, stop_ticker, ticker); } - printf("%2u up %2u ops/tx %7.3f txn/ms %5.3f ops/us ", updaters, - updateops, - stats[1].txns / (stats[1].time * MS_PER_SEC / updaters), - stats[1].ops / (stats[1].time * US_PER_SEC / updaters)); - printf("%2u rd %2u ops/tx %8.3f txn/ms %7.3f ops/us %6.3f ops/us/thr\n", - readers, readops, - stats[0].txns / (stats[0].time * MS_PER_SEC / readers), - stats[0].ops / (stats[0].time * US_PER_SEC / readers), - stats[0].ops / (stats[0].time * US_PER_SEC)); } int main(void) { - dns_qpmulti_t *multi = NULL; + isc_loopmgr_t *loopmgr = NULL; isc_mem_t *mctx = NULL; + uint32_t nloops; + const char *env_workers = getenv("ISC_TASK_WORKERS"); + if (env_workers != NULL) { + nloops = atoi(env_workers); + } else { + nloops = isc_os_ncpus(); + } + INSIST(nloops > 1); + isc_mem_create(&mctx); isc_mem_setdestroycheck(mctx, true); init_logging(mctx); init_items(mctx); - uint32_t threads = 12; - uint32_t max = ITEM_COUNT; - useconds_t runtime = 0.2 * US_PER_SEC; - - init_multi(mctx, &multi, max); - for (uint32_t t = 2; t <= threads; t++) { - dispatch_threads(multi, runtime, max, 1, 64, t - 1, 8); - } - dns_qpmulti_destroy(&multi); - - for (max = 1000; max <= ITEM_COUNT; max *= 10) { - init_multi(mctx, &multi, max); - for (uint32_t t = 1; t <= threads; t++) { - dispatch_threads(multi, runtime, max, 0, 0, t, 64); - } - dns_qpmulti_destroy(&multi); - } + isc_loopmgr_create(mctx, nloops, &loopmgr); + setup_tickers(mctx, loopmgr); + isc_loop_setup(isc_loop_main(loopmgr), startup, loopmgr); + isc_loopmgr_run(loopmgr); + isc_loopmgr_destroy(&loopmgr); isc_log_destroy(&dns_lctx); isc_mem_free(mctx, item); + 
isc_mem_checkdestroyed(stdout); isc_mem_destroy(&mctx); - isc_mem_checkdestroyed(stderr); return (0); } diff --git a/tests/dns/qp_test.c b/tests/dns/qp_test.c index 762859e27d..2ee05a9fdf 100644 --- a/tests/dns/qp_test.c +++ b/tests/dns/qp_test.c @@ -40,30 +40,30 @@ ISC_RUN_TEST_IMPL(qpkey_name) { } testcases[] = { { .namestr = ".", - .key = { 0x01, 0x01 }, + .key = { 0x02, 0x02 }, .len = 1, }, { .namestr = "\\000", - .key = { 0x02, 0x02, 0x01, 0x01 }, + .key = { 0x03, 0x03, 0x02, 0x02 }, .len = 3, }, { .namestr = "example.com.", - .key = { 0x01, 0x15, 0x21, 0x1f, 0x01, 0x17, 0x2a, 0x13, - 0x1f, 0x22, 0x1e, 0x17, 0x01, 0x01 }, + .key = { 0x02, 0x16, 0x22, 0x20, 0x02, 0x18, 0x2b, 0x14, + 0x20, 0x23, 0x1f, 0x18, 0x02, 0x02 }, .len = 13, }, { .namestr = "example.com", - .key = { 0x15, 0x21, 0x1f, 0x01, 0x17, 0x2a, 0x13, 0x1f, - 0x22, 0x1e, 0x17, 0x01, 0x01 }, + .key = { 0x16, 0x22, 0x20, 0x02, 0x18, 0x2b, 0x14, 0x20, + 0x23, 0x1f, 0x18, 0x02, 0x02 }, .len = 12, }, { .namestr = "EXAMPLE.COM", - .key = { 0x15, 0x21, 0x1f, 0x01, 0x17, 0x2a, 0x13, 0x1f, - 0x22, 0x1e, 0x17, 0x01, 0x01 }, + .key = { 0x16, 0x22, 0x20, 0x02, 0x18, 0x2b, 0x14, 0x20, + 0x23, 0x1f, 0x18, 0x02, 0x02 }, .len = 12, }, }; @@ -78,8 +78,8 @@ ISC_RUN_TEST_IMPL(qpkey_name) { in = dns_fixedname_name(&fn1); len = dns_qpkey_fromname(key, in); - assert_true(testcases[i].len == len); - assert_true(memcmp(testcases[i].key, key, len) == 0); + assert_int_equal(testcases[i].len, len); + assert_memory_equal(testcases[i].key, key, len); out = dns_fixedname_initname(&fn2); qp_test_keytoname(key, out); diff --git a/tests/dns/qpmulti_test.c b/tests/dns/qpmulti_test.c index e176c17093..dede4951ba 100644 --- a/tests/dns/qpmulti_test.c +++ b/tests/dns/qpmulti_test.c @@ -22,8 +22,12 @@ #define UNIT_TESTING #include +#include #include +#include +#include #include +#include #include #include #include @@ -44,11 +48,10 @@ #define TRANSACTION_COUNT 1234 #if VERBOSE -#define TRACE(fmt, ...) 
\ - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \ - ISC_LOG_DEBUG(7), "%s:%d:%s: " fmt, __FILE__, __LINE__, \ - __func__, ##__VA_ARGS__) - +#define TRACE(fmt, ...) \ + isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \ + ISC_LOG_DEBUG(7), "%s:%d:%s(): " fmt, __FILE__, \ + __LINE__, __func__, ##__VA_ARGS__) #else #define TRACE(...) #endif @@ -110,8 +113,8 @@ item_attach(void *ctx, void *pval, uint32_t ival) { static void item_detach(void *ctx, void *pval, uint32_t ival) { - INSIST(ctx == NULL); - INSIST(pval == &item[ival]); + assert_null(ctx); + assert_ptr_equal(pval, &item[ival]); item[ival].refcount--; } @@ -124,11 +127,11 @@ item_makekey(dns_qpkey_t key, void *ctx, void *pval, uint32_t ival) { if (!(ival < ARRAY_SIZE(item) && lo <= ip && ip < hi && pval == &item[ival])) { - TRACE("ival %u pval %lx", ival, ip); - INSIST(ival < ARRAY_SIZE(item)); - INSIST(ip >= lo); - INSIST(ip < hi); - INSIST(pval == &item[ival]); + ISC_INSIST(ival < ARRAY_SIZE(item)); + ISC_INSIST(pval != NULL); + ISC_INSIST(ip >= lo); + ISC_INSIST(ip < hi); + ISC_INSIST(pval == &item[ival]); } memmove(key, item[ival].key, item[ival].len); return (item[ival].len); @@ -182,13 +185,13 @@ checkkey(dns_qpreadable_t qpr, size_t i, bool exists, const char *rubric) { isc_result_t result; result = dns_qp_getkey(qpr, item[i].key, item[i].len, &pval, &ival); if (result == ISC_R_SUCCESS) { - ASSERT(exists == true); - ASSERT(pval == &item[i]); - ASSERT(ival == i); + assert_true(exists); + assert_ptr_equal(pval, &item[i]); + assert_int_equal(ival, i); } else if (result == ISC_R_NOTFOUND) { - ASSERT(exists == false); - ASSERT(pval == NULL); - ASSERT(ival == ~0U); + assert_false(exists); + assert_null(pval); + assert_int_equal(ival, ~0U); } else { UNREACHABLE(); } @@ -232,9 +235,9 @@ one_transaction(dns_qpmulti_t *qpm) { isc_result_t result; bool ok = true; - dns_qpreadable_t qpo = (dns_qpreadable_t)(dns_qp_t *)NULL; - dns_qpread_t *qpr = NULL; + dns_qpreader_t *qpo = 
NULL;
 	dns_qpsnap_t *qps = NULL;
+	dns_qpread_t qpr = { 0 };
 	dns_qp_t *qpw = NULL;
 	bool snap = isc_random_uniform(2) == 0;
@@ -242,7 +245,7 @@ one_transaction(dns_qpmulti_t *qpm) {
 	bool rollback = update && isc_random_uniform(4) == 0;
 	size_t count = isc_random_uniform(TRANSACTION_SIZE);
 
-	TRACE("transaction %s %s %s %zu", snap ? "snapshot" : "query",
+	TRACE("transaction %s %s %s size %zu", snap ? "snapshot" : "query",
 	      update ? "update" : "write", rollback ? "rollback" : "commit",
 	      count);
 
@@ -255,7 +258,7 @@ one_transaction(dns_qpmulti_t *qpm) {
 	/* briefly take and drop mutex */
 	if (snap) {
 		dns_qpmulti_snapshot(qpm, &qps);
-		qpo = (dns_qpreadable_t)qps;
+		qpo = (dns_qpreader_t *)qps;
 	}
 
 	/* take mutex */
@@ -265,10 +268,9 @@ one_transaction(dns_qpmulti_t *qpm) {
 		dns_qpmulti_write(qpm, &qpw);
 	}
 
-	/* take rwlock */
 	if (!snap) {
 		dns_qpmulti_query(qpm, &qpr);
-		qpo = (dns_qpreadable_t)qpr;
+		qpo = (dns_qpreader_t *)&qpr;
 	}
 
 	for (size_t n = 0; n < count; n++) {
@@ -278,15 +280,15 @@ one_transaction(dns_qpmulti_t *qpm) {
 		ASSERT(checkkey(qpw, i, item[i].in_rw, "before rw"));
 
 		if (item[i].in_rw) {
-			/* TRACE("delete %zu %.*s", i, item[i].len,
-			   item[i].ascii); */
+			/* TRACE("delete %zu %.*s", i,
+			   item[i].len, item[i].ascii); */
 			result = dns_qp_deletekey(qpw, item[i].key,
 						  item[i].len);
 			ASSERT(result == ISC_R_SUCCESS);
 			item[i].in_rw = false;
 		} else {
-			/* TRACE("insert %zu %.*s", i, item[i].len,
-			   item[i].ascii); */
+			/* TRACE("insert %zu %.*s", i,
+			   item[i].len, item[i].ascii); */
 			result = dns_qp_insert(qpw, &item[i], i);
 			ASSERT(result == ISC_R_SUCCESS);
 			item[i].in_rw = true;
@@ -307,7 +309,6 @@ one_transaction(dns_qpmulti_t *qpm) {
 	assert_true(checkallrw(qpw));
 
 	if (!snap) {
-		/* drop the rwlock so the commit can take it */
 		dns_qpread_destroy(qpm, &qpr);
 	}
 
@@ -322,7 +323,7 @@ one_transaction(dns_qpmulti_t *qpm) {
 						 "rollback ro"));
 			}
 			item[i].in_rw = item[i].in_ro;
-			ASSERT(checkkey(qpr, i, item[i].in_rw, "rollback rw"));
+			ASSERT(checkkey(&qpr, i, item[i].in_rw, "rollback rw"));
 		}
 		dns_qpread_destroy(qpm, &qpr);
 	} else {
@@ -336,7 +337,7 @@ one_transaction(dns_qpmulti_t *qpm) {
 						 "commit ro"));
 			}
 			item[i].in_ro = item[i].in_rw;
-			ASSERT(checkkey(qpr, i, item[i].in_rw, "commit rw"));
+			ASSERT(checkkey(&qpr, i, item[i].in_rw, "commit rw"));
 		}
 		dns_qpread_destroy(qpm, &qpr);
 	}
@@ -347,28 +348,45 @@ one_transaction(dns_qpmulti_t *qpm) {
 		dns_qpsnap_destroy(qpm, &qps);
 	}
 
+	TRACE("completed %s %s %s size %zu", snap ? "snapshot" : "query",
+	      update ? "update" : "write", rollback ? "rollback" : "commit",
+	      count);
+
 	if (!ok) {
 		TRACE("transaction failed");
 		dns_qpmulti_query(qpm, &qpr);
-		qp_test_dumptrie(qpr);
+		qp_test_dumptrie(&qpr);
 		dns_qpread_destroy(qpm, &qpr);
 	}
 	assert_true(ok);
 }
 
-ISC_RUN_TEST_IMPL(qpmulti) {
-	setup_logging();
-	setup_items();
+static void
+many_transactions(void *arg) {
+	UNUSED(arg);
 
 	dns_qpmulti_t *qpm = NULL;
-	dns_qpmulti_create(mctx, &test_methods, NULL, &qpm);
+	dns_qpmulti_create(mctx, loopmgr, &test_methods, NULL, &qpm);
+	qpm->writer.write_protect = true;
 
 	for (size_t n = 0; n < TRANSACTION_COUNT; n++) {
+		TRACE("transaction %zu", n);
 		one_transaction(qpm);
+		isc__qsbr_quiescent_state(isc_loop_current(loopmgr));
+		isc_loopmgr_wakeup(loopmgr);
 	}
 
 	dns_qpmulti_destroy(&qpm);
+	isc_loopmgr_shutdown(loopmgr);
+}
+
+ISC_RUN_TEST_IMPL(qpmulti) {
+	setup_loopmgr(NULL);
+	setup_logging();
+	setup_items();
+	isc_loop_setup(isc_loop_main(loopmgr), many_transactions, NULL);
+	isc_loopmgr_run(loopmgr);
+	isc_loopmgr_destroy(&loopmgr);
 	isc_log_destroy(&dns_lctx);
 }
diff --git a/tests/libtest/qp.c b/tests/libtest/qp.c
index 1dfabd77c8..0b3e11e001 100644
--- a/tests/libtest/qp.c
+++ b/tests/libtest/qp.c
@@ -16,7 +16,9 @@
 
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
@@ -136,7 +138,7 @@ qp_test_keytoname(const dns_qpkey_t key, dns_name_t *name) {
 
 static size_t
 getheight(dns_qp_t *qp, qp_node_t *n) {
-	if (!is_branch(n)) {
+	if (node_tag(n) == LEAF_TAG) {
 		return (0);
 	}
 	size_t max_height = 0;
@@ -151,18 +153,15 @@ getheight(dns_qp_t *qp, qp_node_t *n) {
 
 size_t
 qp_test_getheight(dns_qp_t *qp) {
-	return (getheight(qp, &qp->root));
+	qp_node_t *root = get_root(qp);
+	return (root == NULL ? 0 : getheight(qp, root));
 }
 
 static size_t
 maxkeylen(dns_qp_t *qp, qp_node_t *n) {
-	if (!is_branch(n)) {
-		if (leaf_pval(n) == NULL) {
-			return (0);
-		} else {
-			dns_qpkey_t key;
-			return (leaf_qpkey(qp, n, key));
-		}
+	if (node_tag(n) == LEAF_TAG) {
+		dns_qpkey_t key;
+		return (leaf_qpkey(qp, n, key));
 	}
 	size_t max_len = 0;
 	qp_weight_t size = branch_twigs_size(n);
@@ -176,7 +175,8 @@ maxkeylen(dns_qp_t *qp, qp_node_t *n) {
 
 size_t
 qp_test_maxkeylen(dns_qp_t *qp) {
-	return (maxkeylen(qp, &qp->root));
+	qp_node_t *root = get_root(qp);
+	return (root == NULL ? 0 : maxkeylen(qp, root));
 }
 
 /***********************************************************************
@@ -186,8 +186,9 @@ qp_test_maxkeylen(dns_qp_t *qp) {
 
 static void
 dumpread(dns_qpreadable_t qpr, const char *type, const char *tail) {
-	dns_qpread_t *qp = dns_qpreadable_cast(qpr);
-	printf("%s %p root %p base %p methods %p%s", type, qp, &qp->root,
+	dns_qpreader_t *qp = dns_qpreader(qpr);
+	printf("%s %p root %u %u:%u base %p methods %p%s", type, qp,
+	       qp->root_ref, ref_chunk(qp->root_ref), ref_cell(qp->root_ref),
 	       qp->base, qp->methods, tail);
 }
 
@@ -195,17 +196,14 @@ static void
 dumpqp(dns_qp_t *qp, const char *type) {
 	dumpread(qp, type, " mctx ");
 	printf("%p\n", qp->mctx);
-	printf("%s %p usage %p generation %u "
-	       "chunk_max %u bump %u fender %u\n",
-	       type, qp, qp->usage, qp->generation, qp->chunk_max, qp->bump,
-	       qp->fender);
+	printf("%s %p usage %p chunk_max %u bump %u fender %u\n", type, qp,
+	       qp->usage, qp->chunk_max, qp->bump, qp->fender);
 	printf("%s %p leaf %u live %u used %u free %u hold %u\n", type, qp,
 	       qp->leaf_count, qp->used_count - qp->free_count, qp->used_count,
 	       qp->free_count, qp->hold_count);
-	printf("%s %p compact_all=%d shared_arrays=%d"
-	       " transaction_mode=%d write_protect=%d\n",
-	       type, qp, qp->compact_all, qp->shared_arrays,
-	       qp->transaction_mode, qp->write_protect);
+	printf("%s %p compact_all=%d transaction_mode=%d write_protect=%d\n",
+	       type, qp, qp->compact_all, qp->transaction_mode,
+	       qp->write_protect);
 }
 
 void
@@ -229,10 +227,19 @@ qp_test_dumpqp(dns_qp_t *qp) {
 
 void
 qp_test_dumpmulti(dns_qpmulti_t *multi) {
-	dumpqp(&multi->phase[0], "qpmulti->phase[0]");
-	dumpqp(&multi->phase[1], "qpmulti->phase[1]");
-	printf("qpmulti %p read %p snapshots %u\n", &multi, multi->read,
-	       multi->snapshots);
+	dns_qpreader_t qpr;
+	qp_node_t *reader = atomic_load(&multi->reader);
+	dns_qpmulti_t *whence = unpack_reader(&qpr, reader);
+	dumpqp(&multi->writer, "qpmulti->writer");
+	printf("qpmulti->reader %p root_ref %u %u:%u base %p\n", reader,
+	       qpr.root_ref, ref_chunk(qpr.root_ref), ref_cell(qpr.root_ref),
+	       qpr.base);
+	printf("qpmulti->reader %p whence %p\n", reader, whence);
+	unsigned int snapshots = 0;
+	for (dns_qpsnap_t *snap = ISC_LIST_HEAD(multi->snapshots); //
+	     snap != NULL; snap = ISC_LIST_NEXT(snap, link), snapshots++)
+	{}
+	printf("qpmulti %p snapshots %u\n", multi, snapshots);
 	fflush(stdout);
 }
 
@@ -242,9 +249,11 @@ qp_test_dumpchunks(dns_qp_t *qp) {
 	qp_cell_t free = 0;
 	dumpqp(qp, "qp");
 	for (qp_chunk_t c = 0; c < qp->chunk_max; c++) {
-		printf("qp %p chunk %u base %p used %u free %u generation %u\n",
-		       qp, c, qp->base[c], qp->usage[c].used, qp->usage[c].free,
-		       qp->usage[c].generation);
+		printf("qp %p chunk %u base %p "
+		       "used %u free %u immutable %u phase %u\n",
+		       qp, c, qp->base->ptr[c], qp->usage[c].used,
+		       qp->usage[c].free, qp->usage[c].immutable,
+		       qp->usage[c].phase);
 		used += qp->usage[c].used;
 		free += qp->usage[c].free;
 	}
@@ -254,7 +263,7 @@ qp_test_dumpchunks(dns_qp_t *qp) {
 
 void
 qp_test_dumptrie(dns_qpreadable_t qpr) {
-	dns_qpread_t *qp = dns_qpreadable_cast(qpr);
+	dns_qpreader_t *qp = dns_qpreader(qpr);
 	struct {
 		qp_ref_t ref;
 		qp_shift_t max, pos;
@@ -267,11 +276,18 @@ qp_test_dumptrie(dns_qpreadable_t qpr) {
 	 * node; the ref is deliberately out of bounds, and pos == max
 	 * so we will immediately stop scanning it
 	 */
-	stack[sp].ref = ~0U;
+	stack[sp].ref = INVALID_REF;
 	stack[sp].max = 0;
 	stack[sp].pos = 0;
-	qp_node_t *n = &qp->root;
-	printf("%p ROOT\n", n);
+
+	qp_node_t *n = get_root(qp);
+	if (n == NULL) {
+		printf("%p EMPTY\n", n);
+		fflush(stdout);
+		return;
+	} else {
+		printf("%p ROOT qp %p base %p\n", n, qp, qp->base);
+	}
 
 	for (;;) {
 		if (is_branch(n)) {
@@ -291,25 +307,20 @@ qp_test_dumptrie(dns_qpreadable_t qpr) {
 			}
 			assert(len == max);
 			qp_test_keytoascii(bits, len);
-			printf("%*s%p BRANCH %p %d %zu %s\n", (int)sp * 2, "",
-			       n, twigs, ref, branch_key_offset(n), bits);
+			printf("%*s%p BRANCH %p %u %u:%u %zu %s\n", (int)sp * 2,
+			       "", n, twigs, ref, ref_chunk(ref), ref_cell(ref),
+			       branch_key_offset(n), bits);
 			++sp;
 			stack[sp].ref = ref;
 			stack[sp].max = max;
 			stack[sp].pos = 0;
 		} else {
-			if (leaf_pval(n) != NULL) {
-				dns_qpkey_t key;
-				qp_test_keytoascii(key, leaf_qpkey(qp, n, key));
-				printf("%*s%p LEAF %p %d %s\n", (int)sp * 2, "",
-				       n, leaf_pval(n), leaf_ival(n), key);
-				leaf_count++;
-			} else {
-				assert(n == &qp->root);
-				assert(leaf_count == 0);
-				printf("%p EMPTY", n);
-			}
+			dns_qpkey_t key;
+			qp_test_keytoascii(key, leaf_qpkey(qp, n, key));
+			printf("%*s%p LEAF %p %d %s\n", (int)sp * 2, "", n,
+			       leaf_pval(n), leaf_ival(n), key);
+			leaf_count++;
 		}
 
 		while (stack[sp].pos == stack[sp].max) {
@@ -328,7 +339,9 @@ qp_test_dumptrie(dns_qpreadable_t qpr) {
 
 static void
 dumpdot_name(qp_node_t *n) {
-	if (is_branch(n)) {
+	if (n == NULL) {
+		printf("empty");
+	} else if (is_branch(n)) {
 		qp_ref_t ref = branch_twigs_ref(n);
 		printf("c%dn%d", ref_chunk(ref), ref_cell(ref));
 	} else {
@@ -338,7 +351,9 @@ dumpdot_name(qp_node_t *n) {
 
 static void
 dumpdot_twig(dns_qp_t *qp, qp_node_t *n) {
-	if (is_branch(n)) {
+	if (n == NULL) {
+		printf("empty [shape=oval, label=\"\\N EMPTY\"];\n");
+	} else if (is_branch(n)) {
 		dumpdot_name(n);
 		printf(" [shape=record, label=\"{ \\N\\noff %zu | ",
 		       branch_key_offset(n));
@@ -370,11 +385,7 @@ dumpdot_twig(dns_qp_t *qp, qp_node_t *n) {
 	} else {
 		dns_qpkey_t key;
 		const char *str;
-		if (leaf_pval(n) == NULL) {
-			str = "EMPTY";
-		} else {
-			str = qp_test_keytoascii(key, leaf_qpkey(qp, n, key));
-		}
+		str = qp_test_keytoascii(key, leaf_qpkey(qp, n, key));
 		printf("v%p [shape=oval, label=\"\\N ival %d\\n%s\"];\n",
 		       leaf_pval(n), leaf_ival(n), str);
 	}
@@ -383,7 +394,7 @@ dumpdot_twig(dns_qp_t *qp, qp_node_t *n) {
 
 void
 qp_test_dumpdot(dns_qp_t *qp) {
 	REQUIRE(QP_VALID(qp));
-	qp_node_t *n = &qp->root;
+	qp_node_t *n = get_root(qp);
 	printf("strict digraph {\nrankdir = \"LR\"; ranksep = 1.0;\n");
 	printf("ROOT [shape=point]; ROOT -> ");
 	dumpdot_name(n);