bind/lib/dns/qp.c

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*
 * For an overview, see doc/design/qp-trie.md
 */

#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#include <sys/mman.h>
#include <unistd.h>
#endif

#include <isc/atomic.h>
#include <isc/buffer.h>
#include <isc/magic.h>
#include <isc/mem.h>
#include <isc/mutex.h>
#include <isc/refcount.h>
#include <isc/result.h>
#include <isc/rwlock.h>
#include <isc/tid.h>
#include <isc/time.h>
#include <isc/types.h>
#include <isc/urcu.h>
#include <isc/util.h>

#include <dns/fixedname.h>
#include <dns/log.h>
#include <dns/name.h>
#include <dns/qp.h>
#include <dns/types.h>

#include "qp_p.h"

#ifndef DNS_QP_LOG_STATS
#define DNS_QP_LOG_STATS 1
#endif
#ifndef DNS_QP_TRACE
#define DNS_QP_TRACE 0
#endif

/*
 * very basic garbage collector statistics
 *
 * XXXFANF for now we're logging GC times, but ideally we should
 * accumulate stats more quietly and report via the statschannel
 */
static atomic_uint_fast64_t compact_time;
static atomic_uint_fast64_t recycle_time;
static atomic_uint_fast64_t rollback_time;

/* for LOG_STATS() format strings */
#define PRItime " %" PRIu64 " ns "

#if DNS_QP_LOG_STATS
#define LOG_STATS(...)                                                      \
	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \
		      ISC_LOG_DEBUG(1), __VA_ARGS__)
#else
#define LOG_STATS(...)
#endif

#if DNS_QP_TRACE
/*
 * TRACE is generally used in allocation-related functions so it doesn't
 * trace very high-frequency ops
 */
#define TRACE(fmt, ...)                                                        \
	do {                                                                   \
		if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) {            \
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,      \
				      DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7),      \
				      "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \
				      __FILE__, __LINE__, __func__, qp,        \
				      qp ? TRIENAME(qp) : "(null)", isc_tid(), \
				      ##__VA_ARGS__);                          \
		}                                                              \
	} while (0)
#else
#define TRACE(...)
#endif

/***********************************************************************
 *
 *  converting DNS names to trie keys
 */

/*
 * Number of distinct byte values, i.e. 256
 */
#define BYTE_VALUES (UINT8_MAX + 1)

/*
 * Lookup table mapping bytes in DNS names to bit positions, used
 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
 *
 * Each element holds one or two bit positions, bit_one in the
 * lower half and bit_two in the upper half.
 *
 * For common hostname characters, bit_two is zero (which cannot
 * be a valid bit position).
 *
 * For others, bit_one is the escape bit, and bit_two is the
 * position of the character within the escaped range.
 */
uint16_t dns_qp_bits_for_byte[BYTE_VALUES] = { 0 };

/*
 * And the reverse, mapping bit positions to characters, so the tests
 * can print diagnostics involving qp-trie keys.
 *
 * This table only handles the first bit in an escape sequence; we
 * arrange that we can calculate the byte value for both bits by
 * adding the the second bit to the first bit's byte value.
 */
uint8_t dns_qp_byte_for_bit[SHIFT_OFFSET] = { 0 };

/*
 * Fill in the lookup tables at program startup. (It doesn't matter
 * when this is initialized relative to other startup code.)
 */
static void
initialize_bits_for_byte(void) ISC_CONSTRUCTOR;

/*
 * The bit positions for bytes inside labels have to be between
 * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.)
 *
 * Each byte range in between common hostname characters has a different
 * escape character, to preserve the correct lexical order.
 *
 * Escaped byte ranges mostly fit into the space available in the
 * bitmap, except for those above 'z' (which is mostly bytes with the
 * top bit set). So, when we reach the end of the bitmap we roll over
 * to the next escape character.
 *
 * After filling the table we ensure that the bit positions for
 * hostname characters and escape characters all fit.
 */
static void
initialize_bits_for_byte(void) {
	/* zero common character marker not a valid shift position */
	INSIST(0 < SHIFT_BITMAP);
	/* first bit is common byte or escape byte */
	dns_qpshift_t bit_one = SHIFT_BITMAP;
	/* second bit is position in escaped range */
	dns_qpshift_t bit_two = SHIFT_BITMAP;
	bool escaping = true;

	for (unsigned int byte = 0; byte < BYTE_VALUES; byte++) {
		if (qp_common_character(byte)) {
			escaping = false;
			bit_one++;
			dns_qp_byte_for_bit[bit_one] = byte;
			dns_qp_bits_for_byte[byte] = bit_one;
		} else if ('A' <= byte && byte <= 'Z') {
			/* map upper case to lower case */
			dns_qpshift_t after_esc = bit_one + 1;
			dns_qpshift_t skip_punct = 'a' - '_';
			dns_qpshift_t letter = byte - 'A';
			dns_qpshift_t bit = after_esc + skip_punct + letter;
			dns_qp_bits_for_byte[byte] = bit;
			/* to simplify reverse conversion */
			bit_two++;
		} else {
			/* non-hostname characters need to be escaped */
			if (!escaping || bit_two >= SHIFT_OFFSET) {
				escaping = true;
				bit_one++;
				dns_qp_byte_for_bit[bit_one] = byte;
				bit_two = SHIFT_BITMAP;
			}
			dns_qp_bits_for_byte[byte] = bit_two << 8 | bit_one;
			bit_two++;
		}
	}
	ENSURE(bit_one < SHIFT_OFFSET);
}

/*
 * Convert a DNS name into a trie lookup key.
 *
 * Returns the length of the key.
 *
 * For performance we get our hands dirty in the guts of the name.
 *
 * We don't worry about the distinction between absolute and relative
 * names. When the trie is only used with absolute names, the first byte
 * of the key will always be SHIFT_NOBYTE and it will always be skipped
 * when traversing the trie. So keeping the root label costs little, and
 * it allows us to support tries of relative names too. In fact absolute
 * and relative names can be mixed in the same trie without causing
 * confusion, because the presence or absence of the initial
 * SHIFT_NOBYTE in the key disambiguates them (exactly like a trailing
 * dot in a zone file).
 */
size_t
dns_qpkey_fromname(dns_qpkey_t key, const dns_name_t *name) {
	size_t len, label;
	dns_fixedname_t fixed;

	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));

	if (name->labels == 0) {
		key[0] = SHIFT_NOBYTE;
		return (0);
	}

	if (name->offsets == NULL) {
		dns_name_t *clone = dns_fixedname_initname(&fixed);
		dns_name_clone(name, clone);
		name = clone;
	}

	len = 0;
	label = name->labels;
	while (label-- > 0) {
		const uint8_t *ldata = name->ndata + name->offsets[label];
		size_t label_len = *ldata++;
		while (label_len-- > 0) {
			uint16_t bits = dns_qp_bits_for_byte[*ldata++];
			key[len++] = bits & 0xFF;	/* bit_one */
			if ((bits >> 8) != 0) {		/* escape? */
				key[len++] = bits >> 8; /* bit_two */
			}
		}
		/* label terminator */
		key[len++] = SHIFT_NOBYTE;
	}
	/* mark end with a double NOBYTE */
	key[len] = SHIFT_NOBYTE;
	ENSURE(len < sizeof(dns_qpkey_t));
	return (len);
}

void
dns_qpkey_toname(const dns_qpkey_t key, size_t keylen, dns_name_t *name) {
	size_t locs[DNS_NAME_MAXLABELS];
	size_t loc = 0, opos = 0;
	size_t offset;

	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
	REQUIRE(name->buffer != NULL);
	REQUIRE(name->offsets != NULL);

	if (keylen == 0) {
		dns_name_reset(name);
		return;
	}

	isc_buffer_clear(name->buffer);

	/* Scan the key looking for label boundaries */
	for (offset = 0; offset <= keylen; offset++) {
		INSIST(key[offset] >= SHIFT_NOBYTE &&
		       key[offset] < SHIFT_OFFSET);
		INSIST(loc < DNS_NAME_MAXLABELS);
		if (qpkey_bit(key, keylen, offset) == SHIFT_NOBYTE) {
			if (qpkey_bit(key, keylen, offset + 1) == SHIFT_NOBYTE)
			{
				locs[loc] = offset + 1;
				goto scanned;
			}
			locs[loc++] = offset + 1;
		} else if (offset == 0) {
			/* This happens for a relative name */
			locs[loc++] = offset;
		}
	}
	UNREACHABLE();
scanned:

	/*
	 * In the key the labels are encoded in reverse order, so
	 * we step backward through the label boundaries, then forward
	 * through the labels, to create the DNS wire format data.
	 */
	name->labels = loc;
	while (loc-- > 0) {
		uint8_t len = 0, *lenp = NULL;

		/* Add a length byte to the name data and set an offset */
		lenp = isc_buffer_used(name->buffer);
		isc_buffer_putuint8(name->buffer, 0);
		name->offsets[opos++] = name->length++;

		/* Convert from escaped byte ranges to ASCII */
		for (offset = locs[loc]; offset < locs[loc + 1] - 1; offset++) {
			uint8_t bit = qpkey_bit(key, keylen, offset);
			uint8_t byte = dns_qp_byte_for_bit[bit];
			if (qp_common_character(byte)) {
				isc_buffer_putuint8(name->buffer, byte);
			} else {
				byte += key[++offset] - SHIFT_BITMAP;
				isc_buffer_putuint8(name->buffer, byte);
			}
			len++;
		}

		name->length += len;
		*lenp = len;
	}

	/* Add a root label for absolute names */
	if (key[0] == SHIFT_NOBYTE) {
		name->attributes.absolute = true;
		isc_buffer_putuint8(name->buffer, 0);
		name->offsets[opos++] = name->length++;
		name->labels++;
	}

	name->ndata = isc_buffer_base(name->buffer);
}

/*
 * Sentinel value for equal keys
 */
#define QPKEY_EQUAL (~(size_t)0)

/*
 * Compare two keys and return the offset where they differ.
 *
 * This offset is used to work out where a trie search diverged: when one
 * of the keys is in the trie and one is not, the common prefix (up to the
 * offset) is the part of the unknown key that exists in the trie. This
 * matters for adding new keys or finding neighbours of missing keys.
 *
 * When the keys are different lengths it is possible (but unwise) for
 * the longer key to be the same as the shorter key but with superfluous
 * trailing SHIFT_NOBYTE elements. This makes the keys equal for the
 * purpose of traversing the trie.
 */
static size_t
qpkey_compare(const dns_qpkey_t key_a, const size_t keylen_a,
	      const dns_qpkey_t key_b, const size_t keylen_b) {
	size_t keylen = ISC_MAX(keylen_a, keylen_b);
	for (size_t offset = 0; offset < keylen; offset++) {
		if (qpkey_bit(key_a, keylen_a, offset) !=
		    qpkey_bit(key_b, keylen_b, offset))
		{
			return (offset);
		}
	}
	return (QPKEY_EQUAL);
}

/***********************************************************************
 *
 *  allocator wrappers
 */

#if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION

/*
 * Optionally (for debugging) during a copy-on-write transaction, use
 * memory protection to ensure that the shared chunks are not modified.
 * Once a chunk becomes shared, it remains read-only until it is freed.
 * POSIX says we have to use mmap() to get an allocation that we can
 * definitely pass to mprotect().
 */

static size_t
chunk_size_raw(void) {
	size_t size = (size_t)sysconf(_SC_PAGE_SIZE);
	return (ISC_MAX(size, QP_CHUNK_BYTES));
}

static void *
chunk_get_raw(dns_qp_t *qp) {
	if (qp->write_protect) {
		size_t size = chunk_size_raw();
		void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
				 MAP_ANON | MAP_PRIVATE, -1, 0);
		RUNTIME_CHECK(ptr != MAP_FAILED);
		return (ptr);
	} else {
		return (isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES));
	}
}

static void
chunk_free_raw(dns_qp_t *qp, void *ptr) {
	if (qp->write_protect) {
		RUNTIME_CHECK(munmap(ptr, chunk_size_raw()) == 0);
	} else {
		isc_mem_free(qp->mctx, ptr);
	}
}

static void *
chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) {
	if (qp->write_protect) {
		return (ptr);
	} else {
		return (isc_mem_reallocate(qp->mctx, ptr, bytes));
	}
}

static void
write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) {
	if (qp->write_protect) {
		/* see transaction_open() wrt this special case */
		if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) {
			return;
		}
		TRACE("chunk %u", chunk);
		void *ptr = qp->base->ptr[chunk];
		size_t size = chunk_size_raw();
		RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0);
	}
}

#else

#define chunk_get_raw(qp)	isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES)
#define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr)

#define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size)

#define write_protect(qp, chunk)

#endif

/***********************************************************************
 *
 *  allocator
 */

/*
 * When we reuse the bump chunk across multiple write transactions,
 * it can have an immutable prefix and a mutable suffix.
 */
static inline bool
cells_immutable(dns_qp_t *qp, dns_qpref_t ref) {
	dns_qpchunk_t chunk = ref_chunk(ref);
	dns_qpcell_t cell = ref_cell(ref);
	if (chunk == qp->bump) {
		return (cell < qp->fender);
	} else {
		return (qp->usage[chunk].immutable);
	}
}

/*
 * Create a fresh bump chunk and allocate some twigs from it.
 */
static dns_qpref_t
chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) {
	INSIST(qp->base->ptr[chunk] == NULL);
	INSIST(qp->usage[chunk].used == 0);
	INSIST(qp->usage[chunk].free == 0);

	qp->base->ptr[chunk] = chunk_get_raw(qp);
	qp->usage[chunk] = (qp_usage_t){ .exists = true, .used = size };
	qp->used_count += size;
	qp->bump = chunk;
	qp->fender = 0;

	if (qp->write_protect) {
		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
	}
	return (make_ref(chunk, 0));
}

/*
 * This is used to grow the chunk arrays when they fill up. If the old
 * base array is in use by readers, we must make a clone, otherwise we
 * can reallocate in place.
 *
 * The isc_refcount_init() and qpbase_unref() in this function are a pair.
 */
static void
realloc_chunk_arrays(dns_qp_t *qp, dns_qpchunk_t newmax) {
	size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max;
	size_t newptrs = sizeof(qp->base->ptr[0]) * newmax;
	size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax);

	if (qp->base == NULL || qpbase_unref(qp)) {
		qp->base = isc_mem_reallocate(qp->mctx, qp->base, size);
	} else {
		dns_qpbase_t *oldbase = qp->base;
		qp->base = isc_mem_allocate(qp->mctx, size);
		memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs);
	}
	memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs);
	isc_refcount_init(&qp->base->refcount, 1);
	qp->base->magic = QPBASE_MAGIC;

	/* usage array is exclusive to the writer */
	size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max;
	size_t newusage = sizeof(qp->usage[0]) * newmax;
	qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage);
	memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage);

	qp->chunk_max = newmax;

	TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max);
}

/*
 * There was no space in the bump chunk, so find a place to put a fresh
 * chunk in the chunk arrays, then allocate some twigs from it.
 */
static dns_qpref_t
alloc_slow(dns_qp_t *qp, dns_qpweight_t size) {
	dns_qpchunk_t chunk;

	for (chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (!qp->usage[chunk].exists) {
			return (chunk_alloc(qp, chunk, size));
		}
	}
	ENSURE(chunk == qp->chunk_max);
	realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk));
	return (chunk_alloc(qp, chunk, size));
}

/*
 * Ensure we are using a fresh bump chunk.
 */
static void
alloc_reset(dns_qp_t *qp) {
	(void)alloc_slow(qp, 0);
}

/*
 * Allocate some fresh twigs. This is the bump allocator fast path.
 */
static inline dns_qpref_t
alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) {
	dns_qpchunk_t chunk = qp->bump;
	dns_qpcell_t cell = qp->usage[chunk].used;

	if (cell + size <= QP_CHUNK_SIZE) {
		qp->usage[chunk].used += size;
		qp->used_count += size;
		return (make_ref(chunk, cell));
	} else {
		return (alloc_slow(qp, size));
	}
}

/*
 * Record that some twigs are no longer being used, and if possible
 * zero them to ensure that there isn't a spurious double detach when
 * the chunk is later recycled.
 *
 * Returns true if the twigs were immediately destroyed.
 *
 * NOTE: the caller is responsible for attaching or detaching any
 * leaves as required.
 */
static inline bool
free_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
	dns_qpchunk_t chunk = ref_chunk(twigs);

	qp->free_count += size;
	qp->usage[chunk].free += size;
	ENSURE(qp->free_count <= qp->used_count);
	ENSURE(qp->usage[chunk].free <= qp->usage[chunk].used);

	if (cells_immutable(qp, twigs)) {
		qp->hold_count += size;
		ENSURE(qp->free_count >= qp->hold_count);
		return (false);
	} else {
		zero_twigs(ref_ptr(qp, twigs), size);
		return (true);
	}
}

/*
 * When some twigs have been copied, and free_twigs() could not
 * immediately destroy the old copy, we need to update the refcount
 * on any leaves that were duplicated.
 */
static void
attach_twigs(dns_qp_t *qp, dns_qpnode_t *twigs, dns_qpweight_t size) {
	for (dns_qpweight_t pos = 0; pos < size; pos++) {
		if (node_tag(&twigs[pos]) == LEAF_TAG) {
			attach_leaf(qp, &twigs[pos]);
		}
	}
}

/***********************************************************************
 *
 *  chunk reclamation
 */

/*
 * Is any of this chunk still in use?
 */
static inline dns_qpcell_t
chunk_usage(dns_qp_t *qp, dns_qpchunk_t chunk) {
	return (qp->usage[chunk].used - qp->usage[chunk].free);
}

/*
 * We remove each empty chunk from the total counts when the chunk is
 * freed, or when it is scheduled for safe memory reclamation. We check
 * the chunk's phase to avoid discounting it twice in the latter case.
 */
static void
chunk_discount(dns_qp_t *qp, dns_qpchunk_t chunk) {
	if (qp->usage[chunk].discounted) {
		return;
	}
	INSIST(qp->used_count >= qp->usage[chunk].used);
	INSIST(qp->free_count >= qp->usage[chunk].free);
	qp->used_count -= qp->usage[chunk].used;
	qp->free_count -= qp->usage[chunk].free;
	qp->usage[chunk].discounted = true;
}

/*
 * When a chunk is being recycled, we need to detach any leaves that
 * remain, and free any `base` arrays that have been marked as unused.
 */
static void
chunk_free(dns_qp_t *qp, dns_qpchunk_t chunk) {
	if (qp->write_protect) {
		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
	}

	dns_qpnode_t *n = qp->base->ptr[chunk];
	for (dns_qpcell_t count = qp->usage[chunk].used; count > 0;
	     count--, n++)
	{
		if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) {
			detach_leaf(qp, n);
		} else if (count > 1 && reader_valid(n)) {
			dns_qpreader_t qpr;
			unpack_reader(&qpr, n);
			/* pairs with dns_qpmulti_commit() */
			if (qpbase_unref(&qpr)) {
				isc_mem_free(qp->mctx, qpr.base);
			}
		}
	}
	chunk_discount(qp, chunk);
	chunk_free_raw(qp, qp->base->ptr[chunk]);
	qp->base->ptr[chunk] = NULL;
	qp->usage[chunk] = (qp_usage_t){};
}

/*
 * Free any chunks that we can while a trie is in use.
 */
static void
recycle(dns_qp_t *qp) {
	unsigned int free = 0;

	isc_nanosecs_t start = isc_time_monotonic();

	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
		    qp->usage[chunk].exists && !qp->usage[chunk].immutable)
		{
			chunk_free(qp, chunk);
			free++;
		}
	}

	isc_nanosecs_t time = isc_time_monotonic() - start;
	atomic_fetch_add_relaxed(&recycle_time, time);

	if (free > 0) {
		LOG_STATS("qp recycle" PRItime "free %u chunks", time, free);
		LOG_STATS("qp recycle leaf %u live %u used %u free %u hold %u",
			  qp->leaf_count, qp->used_count - qp->free_count,
			  qp->used_count, qp->free_count, qp->hold_count);
	}
}

/*
 * asynchronous cleanup
 */
static void
reclaim_chunks_cb(struct rcu_head *arg) {
	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
	REQUIRE(QPRCU_VALID(rcuctx));
	dns_qpmulti_t *multi = rcuctx->multi;
	REQUIRE(QPMULTI_VALID(multi));

	LOCK(&multi->mutex);

	dns_qp_t *qp = &multi->writer;
	REQUIRE(QP_VALID(qp));

	unsigned int free = 0;
	isc_nanosecs_t start = isc_time_monotonic();

	for (unsigned int i = 0; i < rcuctx->count; i++) {
		dns_qpchunk_t chunk = rcuctx->chunk[i];
		if (qp->usage[chunk].snapshot) {
			/* cleanup when snapshot is destroyed */
			qp->usage[chunk].snapfree = true;
		} else {
			chunk_free(qp, chunk);
			free++;
		}
	}

	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));

	isc_nanosecs_t time = isc_time_monotonic() - start;
	recycle_time += time;

	if (free > 0) {
		LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free);
		LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u",
			  qp->leaf_count, qp->used_count - qp->free_count,
			  qp->used_count, qp->free_count, qp->hold_count);
	}

	UNLOCK(&multi->mutex);
}

/*
 * At the end of a transaction, schedule empty but immutable chunks
 * for reclamation later.
 */
static void
reclaim_chunks(dns_qpmulti_t *multi) {
	dns_qp_t *qp = &multi->writer;

	unsigned int count = 0;
	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
		    !qp->usage[chunk].discounted)
		{
			count++;
		}
	}

	if (count == 0) {
		return;
	}

	qp_rcuctx_t *rcuctx =
		isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count));
	*rcuctx = (qp_rcuctx_t){
		.magic = QPRCU_MAGIC,
		.multi = multi,
		.count = count,
	};
	isc_mem_attach(qp->mctx, &rcuctx->mctx);

	unsigned int i = 0;
	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
		    !qp->usage[chunk].discounted)
		{
			rcuctx->chunk[i++] = chunk;
			chunk_discount(qp, chunk);
		}
	}

	call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb);

	LOG_STATS("qp will reclaim %u chunks", count);
}

/*
 * When a snapshot is destroyed, clean up chunks that need free()ing
 * and are not used by any remaining snapshots.
 */
static void
marksweep_chunks(dns_qpmulti_t *multi) {
	unsigned int free = 0;

	isc_nanosecs_t start = isc_time_monotonic();

	dns_qp_t *qpw = &multi->writer;

	for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL;
	     qps = ISC_LIST_NEXT(qps, link))
	{
		for (dns_qpchunk_t chunk = 0; chunk < qps->chunk_max; chunk++) {
			if (qps->base->ptr[chunk] != NULL) {
				INSIST(qps->base->ptr[chunk] ==
				       qpw->base->ptr[chunk]);
				qpw->usage[chunk].snapmark = true;
			}
		}
	}

	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
		qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark;
		qpw->usage[chunk].snapmark = false;
		if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) {
			chunk_free(qpw, chunk);
			free++;
		}
	}

	isc_nanosecs_t time = isc_time_monotonic() - start;
	recycle_time += time;

	if (free > 0) {
		LOG_STATS("qp marksweep" PRItime "free %u chunks", time, free);
		LOG_STATS(
			"qp marksweep leaf %u live %u used %u free %u hold %u",
			qpw->leaf_count, qpw->used_count - qpw->free_count,
			qpw->used_count, qpw->free_count, qpw->hold_count);
	}
}

/***********************************************************************
 *
 *  garbage collector
 */

/*
 * Move a branch node's twigs to the `bump` chunk, for copy-on-write
 * or for garbage collection. We don't update the node in place
 * because `compact_recursive()` does not ensure the node itself is
 * mutable until after it discovers evacuation was necessary.
 *
 * If free_twigs() could not immediately destroy the old twigs, we have
 * to re-attach to any leaves.
 */
static dns_qpref_t
evacuate(dns_qp_t *qp, dns_qpnode_t *n) {
	dns_qpweight_t size = branch_twigs_size(n);
	dns_qpref_t old_ref = branch_twigs_ref(n);
	dns_qpref_t new_ref = alloc_twigs(qp, size);
	dns_qpnode_t *old_twigs = ref_ptr(qp, old_ref);
	dns_qpnode_t *new_twigs = ref_ptr(qp, new_ref);

	move_twigs(new_twigs, old_twigs, size);
	if (!free_twigs(qp, old_ref, size)) {
		attach_twigs(qp, new_twigs, size);
	}

	return (new_ref);
}

/*
 * Immutable nodes need copy-on-write. As we walk down the trie finding the
 * right place to modify, make_root_mutable() and make_twigs_mutable()
 * are called to ensure that immutable nodes on the path from the root are
 * copied to a mutable chunk.
 */

static inline dns_qpnode_t *
make_root_mutable(dns_qp_t *qp) {
	if (cells_immutable(qp, qp->root_ref)) {
		qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp));
	}
	return (ref_ptr(qp, qp->root_ref));
}

static inline void
make_twigs_mutable(dns_qp_t *qp, dns_qpnode_t *n) {
	if (cells_immutable(qp, branch_twigs_ref(n))) {
		*n = make_node(branch_index(n), evacuate(qp, n));
	}
}

/*
 * Compact the trie by traversing the whole thing recursively, copying
 * bottom-up as required. The aim is to avoid evacuation as much as
 * possible, but when parts of the trie are immutable, we need to evacuate
 * the paths from the root to the parts of the trie that occupy
 * fragmented chunks.
 *
 * Without the QP_MIN_USED check, the algorithm will leave the trie
 * unchanged. If the children are all leaves, the loop changes nothing,
 * so we will return this node's original ref. If all of the children
 * that are branches did not need moving, again, the loop changes
 * nothing. So the evacuation check is the only place that the
 * algorithm introduces ref changes, that then bubble up towards the
 * root through the logic inside the loop.
 */
static dns_qpref_t
compact_recursive(dns_qp_t *qp, dns_qpnode_t *parent) {
	dns_qpweight_t size = branch_twigs_size(parent);
	dns_qpref_t twigs_ref = branch_twigs_ref(parent);
	dns_qpchunk_t chunk = ref_chunk(twigs_ref);

	if (qp->compact_all ||
	    (chunk != qp->bump && chunk_usage(qp, chunk) < QP_MIN_USED))
	{
		twigs_ref = evacuate(qp, parent);
	}
	bool immutable = cells_immutable(qp, twigs_ref);
	for (dns_qpweight_t pos = 0; pos < size; pos++) {
		dns_qpnode_t *child = ref_ptr(qp, twigs_ref) + pos;
		if (!is_branch(child)) {
			continue;
		}
		dns_qpref_t old_grandtwigs = branch_twigs_ref(child);
		dns_qpref_t new_grandtwigs = compact_recursive(qp, child);
		if (old_grandtwigs == new_grandtwigs) {
			continue;
		}
		if (immutable) {
			twigs_ref = evacuate(qp, parent);
			/* the twigs have moved */
			child = ref_ptr(qp, twigs_ref) + pos;
			immutable = false;
		}
		*child = make_node(branch_index(child), new_grandtwigs);
	}
	return (twigs_ref);
}

static void
compact(dns_qp_t *qp) {
	LOG_STATS("qp compact before leaf %u live %u used %u free %u hold %u",
		  qp->leaf_count, qp->used_count - qp->free_count,
		  qp->used_count, qp->free_count, qp->hold_count);

	isc_nanosecs_t start = isc_time_monotonic();

	if (qp->usage[qp->bump].free > QP_MAX_FREE) {
		alloc_reset(qp);
	}

	if (qp->leaf_count > 0) {
		qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp));
	}
	qp->compact_all = false;

	isc_nanosecs_t time = isc_time_monotonic() - start;
	atomic_fetch_add_relaxed(&compact_time, time);

	LOG_STATS("qp compact" PRItime
		  "leaf %u live %u used %u free %u hold %u",
		  time, qp->leaf_count, qp->used_count - qp->free_count,
		  qp->used_count, qp->free_count, qp->hold_count);
}

void
dns_qp_compact(dns_qp_t *qp, dns_qpgc_t mode) {
	REQUIRE(QP_VALID(qp));
	if (mode == DNS_QPGC_MAYBE && !QP_NEEDGC(qp)) {
		return;
	}
	if (mode == DNS_QPGC_ALL) {
		alloc_reset(qp);
		qp->compact_all = true;
	}
	compact(qp);
	recycle(qp);
}

/*
 * Free some twigs and (if they were destroyed immediately so that the
 * result from QP_MAX_GARBAGE can change) compact the trie if necessary.
 *
 * This is called by the trie modification API entry points. The
 * free_twigs() function requires the caller to attach or detach any
 * leaves as necessary. Callers of squash_twigs() satisfy this
 * requirement by calling make_twigs_mutable().
 *
 * Aside: In typical garbage collectors, compaction is triggered when
 * the allocator runs out of space. But that is because typical garbage
 * collectors do not know how much memory can be recovered, so they must
 * find out by scanning the heap. The qp-trie code was originally
 * designed to use malloc() and free(), so it has more information about
 * when garbage collection might be worthwhile. Hence we can trigger
 * collection when garbage passes a threshold.
 *
 * XXXFANF: If we need to avoid latency outliers caused by compaction in
 * write transactions, we can check qp->transaction_mode here.
 */
static inline bool
squash_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
	bool destroyed = free_twigs(qp, twigs, size);
	if (destroyed && QP_AUTOGC(qp)) {
		compact(qp);
		recycle(qp);
		/*
		 * This shouldn't happen if the garbage collector is
		 * working correctly. We can recover at the cost of some
		 * time and space, but recovery should be cheaper than
		 * letting compact+recycle fail repeatedly.
		 */
		if (QP_AUTOGC(qp)) {
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_QP, ISC_LOG_NOTICE,
				      "qp %p uctx \"%s\" compact/recycle "
				      "failed to recover any space, "
				      "scheduling a full compaction",
				      qp, TRIENAME(qp));
			qp->compact_all = true;
		}
	}
	return (destroyed);
}

/***********************************************************************
 *
 *  public accessors for memory management internals
 */

dns_qp_memusage_t
dns_qp_memusage(dns_qp_t *qp) {
	REQUIRE(QP_VALID(qp));

	dns_qp_memusage_t memusage = {
		.uctx = qp->uctx,
		.leaves = qp->leaf_count,
		.live = qp->used_count - qp->free_count,
		.used = qp->used_count,
		.hold = qp->hold_count,
		.free = qp->free_count,
		.node_size = sizeof(dns_qpnode_t),
		.chunk_size = QP_CHUNK_SIZE,
		.fragmented = QP_NEEDGC(qp),
	};

	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (qp->base->ptr[chunk] != NULL) {
			memusage.chunk_count += 1;
		}
	}

	/*
	 * XXXFANF does not subtract chunks that have been shrunk,
	 * and does not count unreclaimed dns_qpbase_t objects
	 */
	memusage.bytes = memusage.chunk_count * QP_CHUNK_BYTES +
			 qp->chunk_max * sizeof(qp->base->ptr[0]) +
			 qp->chunk_max * sizeof(qp->usage[0]);

	return (memusage);
}

dns_qp_memusage_t
dns_qpmulti_memusage(dns_qpmulti_t *multi) {
	REQUIRE(QPMULTI_VALID(multi));
	LOCK(&multi->mutex);

	dns_qp_t *qp = &multi->writer;
	INSIST(QP_VALID(qp));

	dns_qp_memusage_t memusage = dns_qp_memusage(qp);

	if (qp->transaction_mode == QP_UPDATE) {
		memusage.bytes -= QP_CHUNK_BYTES;
		memusage.bytes += qp->usage[qp->bump].used *
				  sizeof(dns_qpnode_t);
	}

	UNLOCK(&multi->mutex);
	return (memusage);
}

void
dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p,
	      isc_nanosecs_t *rollback_p) {
	*compact_p = atomic_load_relaxed(&compact_time);
	*recycle_p = atomic_load_relaxed(&recycle_time);
	*rollback_p = atomic_load_relaxed(&rollback_time);
}

/***********************************************************************
 *
 *  read-write transactions
 */

static dns_qp_t *
transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) {
	REQUIRE(QPMULTI_VALID(multi));
	REQUIRE(qptp != NULL && *qptp == NULL);

	LOCK(&multi->mutex);

	dns_qp_t *qp = &multi->writer;
	INSIST(QP_VALID(qp));

	/*
	 * Mark existing chunks as immutable.
	 *
	 * Aside: The bump chunk is special: in a series of write
	 * transactions the bump chunk is reused; the first part (up
	 * to fender) is immutable, the rest mutable. But we set its
	 * immutable flag so that when the bump chunk fills up, the
	 * first part continues to be treated as immutable. (And the
	 * rest of the chunk too, but that's OK.)
	 */
	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (qp->usage[chunk].exists) {
			qp->usage[chunk].immutable = true;
			write_protect(qp, chunk);
		}
	}

	/*
	 * Ensure QP_AUTOGC() ignores free space in immutable chunks.
	 */
	qp->hold_count = qp->free_count;

	*qptp = qp;
	return (qp);
}

/*
 * a write is light
 *
 * We need to ensure we allocate from a fresh chunk if the last transaction
 * shrunk the bump chunk; but usually in a sequence of write transactions
 * we just put `fender` at the point where we started this generation.
 *
 * (Aside: Instead of keeping the previous transaction's mode, I
 * considered forcing allocation into the slow path by fiddling with
 * the bump chunk's usage counters. But that is troublesome because
 * `chunk_free()` needs to know how much of the chunk to scan.)
 */
void
dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) {
	dns_qp_t *qp = transaction_open(multi, qptp);
	TRACE("");

	if (qp->transaction_mode == QP_WRITE) {
		qp->fender = qp->usage[qp->bump].used;
	} else {
		alloc_reset(qp);
	}
	qp->transaction_mode = QP_WRITE;
}

/*
 * an update is heavier
 *
 * We always reset the allocator to the start of a fresh chunk,
 * because the previous transaction was probably an update that shrunk
 * the bump chunk. It simplifies rollback because `fender` is always zero.
 *
 * To rollback a transaction, we need to reset all the allocation
 * counters to their previous state, in particular we need to un-free
 * any nodes that were copied to make them mutable. This means we need
 * to make a copy of basically the whole `dns_qp_t writer`: everything
 * but the chunks holding the trie nodes.
 *
 * We do most of the transaction setup before creating the rollback
 * state so that after rollback we have a correct idea of which chunks
 * are immutable, and so we have the correct transaction mode to make
 * the next transaction allocate a new bump chunk. The exception is
 * resetting the allocator, which we do after creating the rollback
 * state; if this transaction is rolled back then the next transaction
 * will start from the rollback state and also reset the allocator as
 * one of its first actions.
 */
void
dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) {
	dns_qp_t *qp = transaction_open(multi, qptp);
	TRACE("");

	qp->transaction_mode = QP_UPDATE;

	dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback));
	memmove(rollback, qp, sizeof(*rollback));
	/* can be uninitialized on the first transaction */
	if (rollback->base != NULL) {
		INSIST(QPBASE_VALID(rollback->base));
		INSIST(qp->usage != NULL && qp->chunk_max > 0);
		/* paired with either _commit() or _rollback() */
		isc_refcount_increment(&rollback->base->refcount);
		size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max;
		rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes);
		memmove(rollback->usage, qp->usage, usage_bytes);
	}
	INSIST(multi->rollback == NULL);
	multi->rollback = rollback;

	alloc_reset(qp);
}

void
dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) {
	REQUIRE(QPMULTI_VALID(multi));
	REQUIRE(qptp != NULL && *qptp == &multi->writer);
	REQUIRE(multi->writer.transaction_mode == QP_WRITE ||
		multi->writer.transaction_mode == QP_UPDATE);

	dns_qp_t *qp = *qptp;
	TRACE("");

	if (qp->transaction_mode == QP_UPDATE) {
		INSIST(multi->rollback != NULL);
		/* paired with dns_qpmulti_update() */
		if (qpbase_unref(multi->rollback)) {
			isc_mem_free(qp->mctx, multi->rollback->base);
		}
		if (multi->rollback->usage != NULL) {
			isc_mem_free(qp->mctx, multi->rollback->usage);
		}
		isc_mem_free(qp->mctx, multi->rollback);
	}
	INSIST(multi->rollback == NULL);

	/* not the first commit? */
	if (multi->reader_ref != INVALID_REF) {
		INSIST(cells_immutable(qp, multi->reader_ref));
		free_twigs(qp, multi->reader_ref, READER_SIZE);
	}

	if (qp->transaction_mode == QP_UPDATE) {
		/* minimize memory overhead */
		compact(qp);
		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
		qp->base->ptr[qp->bump] = chunk_shrink_raw(
			qp, qp->base->ptr[qp->bump],
			qp->usage[qp->bump].used * sizeof(dns_qpnode_t));
	} else {
		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
	}

	/* anchor a new version of the trie */
	dns_qpnode_t *reader = ref_ptr(qp, multi->reader_ref);
	make_reader(reader, multi);
	/* paired with chunk_free() */
	isc_refcount_increment(&qp->base->refcount);

	rcu_assign_pointer(multi->reader, reader); /* COMMIT */

	/* clean up what we can right now */
	if (qp->transaction_mode == QP_UPDATE || QP_NEEDGC(qp)) {
		recycle(qp);
	}

	/* schedule the rest for later */
	reclaim_chunks(multi);

	*qptp = NULL;
	UNLOCK(&multi->mutex);
}

/*
 * Throw away everything that was allocated during this transaction.
 */
void
dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) {
	unsigned int free = 0;

	REQUIRE(QPMULTI_VALID(multi));
	REQUIRE(multi->writer.transaction_mode == QP_UPDATE);
	REQUIRE(qptp != NULL && *qptp == &multi->writer);

	dns_qp_t *qp = *qptp;
	TRACE("");

	isc_nanosecs_t start = isc_time_monotonic();

	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable)
		{
			chunk_free(qp, chunk);
			/*
			 * we need to clear its base pointer in the rollback
			 * trie, in case the arrays were resized
			 */
			if (chunk < multi->rollback->chunk_max) {
				INSIST(!multi->rollback->usage[chunk].exists);
				multi->rollback->base->ptr[chunk] = NULL;
			}
			free++;
		}
	}

	/*
	 * multi->rollback->base and multi->writer->base are the same,
	 * unless there was a realloc_chunk_arrays() during the transaction
	 */
	if (qpbase_unref(qp)) {
		/* paired with dns_qpmulti_update() */
		isc_mem_free(qp->mctx, qp->base);
	}
	isc_mem_free(qp->mctx, qp->usage);

	/* reset allocator state */
	INSIST(multi->rollback != NULL);
	memmove(qp, multi->rollback, sizeof(*qp));
	isc_mem_free(qp->mctx, multi->rollback);
	INSIST(multi->rollback == NULL);

	isc_nanosecs_t time = isc_time_monotonic() - start;
	atomic_fetch_add_relaxed(&rollback_time, time);

	LOG_STATS("qp rollback" PRItime "free %u chunks", time, free);

	*qptp = NULL;
	UNLOCK(&multi->mutex);
}

/***********************************************************************
 *
 *  read-only transactions
 */

static dns_qpmulti_t *
reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	dns_qpnode_t *reader = rcu_dereference(multi->reader);
	if (reader == NULL) {
		QP_INIT(qp, multi->writer.methods, multi->writer.uctx);
	} else {
		multi = unpack_reader(qp, reader);
	}
	return (multi);
}

/*
 * a query is light
 */

void
dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) {
	REQUIRE(QPMULTI_VALID(multi));
	REQUIRE(qp != NULL);

	qp->tid = isc_tid();
	rcu_read_lock();

	dns_qpmulti_t *whence = reader_open(multi, qp);
	INSIST(whence == multi);
}

void
dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) {
	REQUIRE(QPMULTI_VALID(multi));
	REQUIRE(QP_VALID(qp));
	REQUIRE(qp->tid == isc_tid());
	*qp = (dns_qpread_t){};
	rcu_read_unlock();
}

/*
 * a snapshot is heavy
 */

void
dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
	REQUIRE(QPMULTI_VALID(multi));
	REQUIRE(qpsp != NULL && *qpsp == NULL);

	rcu_read_lock();

	LOCK(&multi->mutex);

	dns_qp_t *qpw = &multi->writer;
	size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) +
		       sizeof(qpw->base->ptr[0]) * qpw->chunk_max;
	dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes);

	qps->whence = reader_open(multi, qps);
	INSIST(qps->whence == multi);

	/* not a separate allocation */
	qps->base = (dns_qpbase_t *)(qps + 1);
	isc_refcount_init(&qps->base->refcount, 0);

	/*
	 * only copy base pointers of chunks we need, so we can
	 * reclaim unused memory in dns_qpsnap_destroy()
	 */
	qps->chunk_max = qpw->chunk_max;
	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
		if (qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) {
			qpw->usage[chunk].snapshot = true;
			qps->base->ptr[chunk] = qpw->base->ptr[chunk];
		} else {
			qps->base->ptr[chunk] = NULL;
		}
	}
	ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link);

	*qpsp = qps;
	UNLOCK(&multi->mutex);
}

void
dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
	REQUIRE(QPMULTI_VALID(multi));
	REQUIRE(qpsp != NULL && *qpsp != NULL);

	LOCK(&multi->mutex);

	dns_qpsnap_t *qp = *qpsp;

	/* make sure the API is being used correctly */
	REQUIRE(qp->whence == multi);

	ISC_LIST_UNLINK(multi->snapshots, qp, link);

	/*
	 * eagerly reclaim chunks that are now unused, so that memory does
	 * not accumulate when a trie has a lot of updates and snapshots
	 */
	marksweep_chunks(multi);

	isc_mem_free(multi->writer.mctx, qp);

	*qpsp = NULL;
	UNLOCK(&multi->mutex);

	rcu_read_unlock();
}

/***********************************************************************
 *
 *  constructors, destructors
 */

void
dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
	      dns_qp_t **qptp) {
	REQUIRE(qptp != NULL && *qptp == NULL);

	dns_qp_t *qp = isc_mem_get(mctx, sizeof(*qp));
	QP_INIT(qp, methods, uctx);
	isc_mem_attach(mctx, &qp->mctx);
	alloc_reset(qp);
	TRACE("");
	*qptp = qp;
}

void
dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
		   dns_qpmulti_t **qpmp) {
	REQUIRE(qpmp != NULL && *qpmp == NULL);

	dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi));
	*multi = (dns_qpmulti_t){
		.magic = QPMULTI_MAGIC,
		.reader_ref = INVALID_REF,
	};
	isc_mutex_init(&multi->mutex);
	ISC_LIST_INIT(multi->snapshots);
	/*
	 * Do not waste effort allocating a bump chunk that will be thrown
	 * away when a transaction is opened. dns_qpmulti_update() always
	 * allocates; to ensure dns_qpmulti_write() does too, pretend the
	 * previous transaction was an update
	 */
	dns_qp_t *qp = &multi->writer;
	QP_INIT(qp, methods, uctx);
	isc_mem_attach(mctx, &qp->mctx);
	qp->transaction_mode = QP_UPDATE;
	TRACE("");
	*qpmp = multi;
}

static void
destroy_guts(dns_qp_t *qp) {
	if (qp->chunk_max == 0) {
		return;
	}
	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
		if (qp->base->ptr[chunk] != NULL) {
			chunk_free(qp, chunk);
		}
	}
	ENSURE(qp->used_count == 0);
	ENSURE(qp->free_count == 0);
	ENSURE(isc_refcount_current(&qp->base->refcount) == 1);
	isc_mem_free(qp->mctx, qp->base);
	isc_mem_free(qp->mctx, qp->usage);
	qp->magic = 0;
}

void
dns_qp_destroy(dns_qp_t **qptp) {
	REQUIRE(qptp != NULL);
	REQUIRE(QP_VALID(*qptp));

	dns_qp_t *qp = *qptp;
	*qptp = NULL;

	/* do not try to destroy part of a dns_qpmulti_t */
	REQUIRE(qp->transaction_mode == QP_NONE);

	TRACE("");
	destroy_guts(qp);
	isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp));
}

static void
qpmulti_destroy_cb(struct rcu_head *arg) {
	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
	REQUIRE(QPRCU_VALID(rcuctx));
	/* only nonzero for reclaim_chunks_cb() */
	REQUIRE(rcuctx->count == 0);

	dns_qpmulti_t *multi = rcuctx->multi;
	REQUIRE(QPMULTI_VALID(multi));

	/* reassure thread sanitizer */
	LOCK(&multi->mutex);

	dns_qp_t *qp = &multi->writer;
	REQUIRE(QP_VALID(qp));

	destroy_guts(qp);

	UNLOCK(&multi->mutex);

	isc_mutex_destroy(&multi->mutex);
	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
	isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
}

void
dns_qpmulti_destroy(dns_qpmulti_t **qpmp) {
	dns_qp_t *qp = NULL;
	dns_qpmulti_t *multi = NULL;
	qp_rcuctx_t *rcuctx = NULL;

	REQUIRE(qpmp != NULL);
	REQUIRE(QPMULTI_VALID(*qpmp));

	multi = *qpmp;
	qp = &multi->writer;
	*qpmp = NULL;

	REQUIRE(QP_VALID(qp));
	REQUIRE(multi->rollback == NULL);
	REQUIRE(ISC_LIST_EMPTY(multi->snapshots));

	rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0));
	*rcuctx = (qp_rcuctx_t){
		.magic = QPRCU_MAGIC,
		.multi = multi,
	};
	isc_mem_attach(qp->mctx, &rcuctx->mctx);
	call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb);
}

/***********************************************************************
 *
 *  modification
 */

isc_result_t
dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) {
	dns_qpref_t new_ref, old_ref;
	dns_qpnode_t new_leaf, old_node;
	dns_qpnode_t *new_twigs = NULL, *old_twigs = NULL;
	dns_qpshift_t new_bit, old_bit;
	dns_qpweight_t old_size, new_size;
	dns_qpkey_t new_key, old_key;
	size_t new_keylen, old_keylen;
	size_t offset;
	uint64_t index;
	dns_qpshift_t bit;
	dns_qpweight_t pos;
	dns_qpnode_t *n = NULL;

	REQUIRE(QP_VALID(qp));

	new_leaf = make_leaf(pval, ival);
	new_keylen = leaf_qpkey(qp, &new_leaf, new_key);

	/* first leaf in an empty trie? */
	if (qp->leaf_count == 0) {
		new_ref = alloc_twigs(qp, 1);
		new_twigs = ref_ptr(qp, new_ref);
		*new_twigs = new_leaf;
		attach_leaf(qp, new_twigs);
		qp->leaf_count++;
		qp->root_ref = new_ref;
		return (ISC_R_SUCCESS);
	}

	/*
	 * We need to keep searching down to a leaf even if our key is
	 * missing from this branch. It doesn't matter which twig we
	 * choose since the keys are all the same up to this node's
	 * offset. Note that if we simply use branch_twig_pos(n, bit)
	 * we may get an out-of-bounds access if our bit is greater
	 * than all the set bits in the node.
	 */
	n = ref_ptr(qp, qp->root_ref);
	while (is_branch(n)) {
		prefetch_twigs(qp, n);
		dns_qpref_t ref = branch_twigs_ref(n);
		bit = branch_keybit(n, new_key, new_keylen);
		pos = branch_has_twig(n, bit) ? branch_twig_pos(n, bit) : 0;
		n = ref_ptr(qp, ref + pos);
	}

	/* do the keys differ, and if so, where? */
	old_keylen = leaf_qpkey(qp, n, old_key);
	offset = qpkey_compare(new_key, new_keylen, old_key, old_keylen);
	if (offset == QPKEY_EQUAL) {
		return (ISC_R_EXISTS);
	}
	new_bit = qpkey_bit(new_key, new_keylen, offset);
	old_bit = qpkey_bit(old_key, old_keylen, offset);

	/* find where to insert a branch or grow an existing branch. */
	n = make_root_mutable(qp);
	while (is_branch(n)) {
		prefetch_twigs(qp, n);
		if (offset < branch_key_offset(n)) {
			goto newbranch;
		}
		if (offset == branch_key_offset(n)) {
			goto growbranch;
		}
		make_twigs_mutable(qp, n);
		bit = branch_keybit(n, new_key, new_keylen);
		INSIST(branch_has_twig(n, bit));
		n = branch_twig_ptr(qp, n, bit);
	}
	/* fall through */

newbranch:
	new_ref = alloc_twigs(qp, 2);
	new_twigs = ref_ptr(qp, new_ref);

	/* save before overwriting. */
	old_node = *n;

	/* new branch node takes old node's place */
	index = BRANCH_TAG | (1ULL << new_bit) | (1ULL << old_bit) |
		((uint64_t)offset << SHIFT_OFFSET);
	*n = make_node(index, new_ref);

	/* populate twigs */
	new_twigs[old_bit > new_bit] = old_node;
	new_twigs[new_bit > old_bit] = new_leaf;

	attach_leaf(qp, &new_leaf);
	qp->leaf_count++;

	return (ISC_R_SUCCESS);

growbranch:
	INSIST(!branch_has_twig(n, new_bit));

	/* locate twigs vectors */
	old_size = branch_twigs_size(n);
	new_size = old_size + 1;
	old_ref = branch_twigs_ref(n);
	new_ref = alloc_twigs(qp, new_size);
	old_twigs = ref_ptr(qp, old_ref);
	new_twigs = ref_ptr(qp, new_ref);

	/* embiggen branch node */
	index = branch_index(n) | (1ULL << new_bit);
	*n = make_node(index, new_ref);

	/* embiggen twigs vector */
	pos = branch_twig_pos(n, new_bit);
	move_twigs(new_twigs, old_twigs, pos);
	new_twigs[pos] = new_leaf;
	move_twigs(new_twigs + pos + 1, old_twigs + pos, old_size - pos);

	if (squash_twigs(qp, old_ref, old_size)) {
		/* old twigs destroyed, only attach to new leaf */
		attach_leaf(qp, &new_leaf);
	} else {
		/* old twigs duplicated, attach to all leaves */
		attach_twigs(qp, new_twigs, new_size);
	}
	qp->leaf_count++;

	return (ISC_R_SUCCESS);
}

isc_result_t
dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key,
		 size_t search_keylen, void **pval_r, uint32_t *ival_r) {
	REQUIRE(QP_VALID(qp));
	REQUIRE(search_keylen < sizeof(dns_qpkey_t));

	if (get_root(qp) == NULL) {
		return (ISC_R_NOTFOUND);
	}

	dns_qpshift_t bit = 0; /* suppress warning */
	dns_qpnode_t *parent = NULL;
	dns_qpnode_t *n = make_root_mutable(qp);
	while (is_branch(n)) {
		prefetch_twigs(qp, n);
		bit = branch_keybit(n, search_key, search_keylen);
		if (!branch_has_twig(n, bit)) {
			return (ISC_R_NOTFOUND);
		}
		make_twigs_mutable(qp, n);
		parent = n;
		n = branch_twig_ptr(qp, n, bit);
	}

	dns_qpkey_t found_key;
	size_t found_keylen = leaf_qpkey(qp, n, found_key);
	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
	    QPKEY_EQUAL)
	{
		return (ISC_R_NOTFOUND);
	}

	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
	detach_leaf(qp, n);
	qp->leaf_count--;

	/* trie becomes empty */
	if (qp->leaf_count == 0) {
		INSIST(parent == NULL);
		INSIST(n == get_root(qp));
		free_twigs(qp, qp->root_ref, 1);
		qp->root_ref = INVALID_REF;
		return (ISC_R_SUCCESS);
	}

	/* step back to parent node */
	n = parent;
	parent = NULL;

	INSIST(bit != 0);
	dns_qpweight_t size = branch_twigs_size(n);
	dns_qpweight_t pos = branch_twig_pos(n, bit);
	dns_qpref_t ref = branch_twigs_ref(n);
	dns_qpnode_t *twigs = ref_ptr(qp, ref);

	if (size == 2) {
		/*
		 * move the other twig to the parent branch.
		 */
		*n = twigs[!pos];
		squash_twigs(qp, ref, 2);
	} else {
		/*
		 * shrink the twigs in place, to avoid using the bump
		 * chunk too fast - the gc will clean up after us
		 */
		*n = make_node(branch_index(n) & ~(1ULL << bit), ref);
		move_twigs(twigs + pos, twigs + pos + 1, size - pos - 1);
		squash_twigs(qp, ref + size - 1, 1);
	}

	return (ISC_R_SUCCESS);
}

isc_result_t
dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name, void **pval_r,
		  uint32_t *ival_r) {
	dns_qpkey_t key;
	size_t keylen = dns_qpkey_fromname(key, name);
	return (dns_qp_deletekey(qp, key, keylen, pval_r, ival_r));
}

/***********************************************************************
 *  chains
 */
static void
maybe_set_name(dns_qpreader_t *qp, dns_qpnode_t *node, dns_name_t *name) {
	dns_qpkey_t key;
	size_t len;

	if (name == NULL) {
		return;
	}

	dns_name_reset(name);
	len = leaf_qpkey(qp, node, key);
	dns_qpkey_toname(key, len, name);
}

void
dns_qpchain_init(dns_qpreadable_t qpr, dns_qpchain_t *chain) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	REQUIRE(QP_VALID(qp));
	REQUIRE(chain != NULL);

	*chain = (dns_qpchain_t){
		.magic = QPCHAIN_MAGIC,
		.qp = qp,
	};
}

unsigned int
dns_qpchain_length(dns_qpchain_t *chain) {
	REQUIRE(QPCHAIN_VALID(chain));

	return (chain->len);
}

void
dns_qpchain_node(dns_qpchain_t *chain, unsigned int level, dns_name_t *name,
		 void **pval_r, uint32_t *ival_r) {
	dns_qpnode_t *node = NULL;

	REQUIRE(QPCHAIN_VALID(chain));
	REQUIRE(level < chain->len);

	node = chain->chain[level].node;
	maybe_set_name(chain->qp, node, name);
	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
}

/***********************************************************************
 *  iterators
 */

void
dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	REQUIRE(QP_VALID(qp));
	REQUIRE(qpi != NULL);
	*qpi = (dns_qpiter_t){
		.qp = qp,
		.magic = QPITER_MAGIC,
	};
}

/*
 * are we at the last twig in this branch (in whichever direction
 * we're iterating)?
 */
static bool
last_twig(dns_qpiter_t *qpi, bool forward) {
	dns_qpweight_t pos = 0, max = 0;
	if (qpi->sp > 0) {
		dns_qpnode_t *child = qpi->stack[qpi->sp];
		dns_qpnode_t *parent = qpi->stack[qpi->sp - 1];
		pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent));
		if (forward) {
			max = branch_twigs_size(parent) - 1;
		}
	}
	return (pos == max);
}

/*
 * move a QP iterator forward or back to the next or previous leaf.
 * note: this function can go wrong when the iterator refers to
 * a mutable view of the trie which is altered while iterating
 */
static isc_result_t
iterate(bool forward, dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
	uint32_t *ival_r) {
	dns_qpnode_t *node = NULL;
	bool initial_branch = true;

	REQUIRE(QPITER_VALID(qpi));

	dns_qpreader_t *qp = qpi->qp;

	REQUIRE(QP_VALID(qp));

	node = get_root(qp);
	if (node == NULL) {
		return (ISC_R_NOMORE);
	}

	do {
		if (qpi->stack[qpi->sp] == NULL) {
			/* newly initialized iterator: use the root node */
			INSIST(qpi->sp == 0);
			qpi->stack[0] = node;
		} else if (!initial_branch) {
			/*
			 * in a prior loop, we reached a branch; from
			 * here we just need to get the highest or lowest
			 * leaf in the subtree; we don't need to bother
			 * stepping forward or backward through twigs
			 * anymore.
			 */
			INSIST(qpi->sp > 0);
		} else if (last_twig(qpi, forward)) {
			/*
			 * we've stepped to the end (or the beginning,
			 * if we're iterating backwards) of a set of twigs.
			 */
			if (qpi->sp == 0) {
				/*
				 * we've finished iterating. reinitialize
				 * the iterator, then return ISC_R_NOMORE.
				 */
				dns_qpiter_init(qpi->qp, qpi);
				return (ISC_R_NOMORE);
			}

			/*
			 * pop the stack, and resume at the parent branch.
			 */
			qpi->stack[qpi->sp] = NULL;
			qpi->sp--;
			continue;
		} else {
			/*
			 * there are more twigs in the current branch,
			 * so step the node pointer forward (or back).
			 */
			qpi->stack[qpi->sp] += (forward ? 1 : -1);
			node = qpi->stack[qpi->sp];
		}

		/*
		 * if we're at a branch now, we loop down to the
		 * left- or rightmost leaf.
		 */
		if (is_branch(node)) {
			qpi->sp++;
			INSIST(qpi->sp < DNS_QP_MAXKEY);
			node = ref_ptr(qp, branch_twigs_ref(node)) +
			       (forward ? 0 : branch_twigs_size(node) - 1);
			qpi->stack[qpi->sp] = node;
			initial_branch = false;
		}
	} while (is_branch(node));

	/* we're at a leaf: return its data to the caller */
	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
	maybe_set_name(qpi->qp, node, name);
	return (ISC_R_SUCCESS);
}

isc_result_t
dns_qpiter_next(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
		uint32_t *ival_r) {
	return (iterate(true, qpi, name, pval_r, ival_r));
}

isc_result_t
dns_qpiter_prev(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
		uint32_t *ival_r) {
	return (iterate(false, qpi, name, pval_r, ival_r));
}

isc_result_t
dns_qpiter_current(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
		   uint32_t *ival_r) {
	dns_qpnode_t *node = NULL;

	REQUIRE(QPITER_VALID(qpi));

	node = qpi->stack[qpi->sp];
	if (node == NULL || is_branch(node)) {
		return (ISC_R_FAILURE);
	}

	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
	maybe_set_name(qpi->qp, node, name);
	return (ISC_R_SUCCESS);
}

/***********************************************************************
 *
 *  search
 */

isc_result_t
dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key,
	      size_t search_keylen, void **pval_r, uint32_t *ival_r) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	dns_qpkey_t found_key;
	size_t found_keylen;
	dns_qpshift_t bit;
	dns_qpnode_t *n = NULL;

	REQUIRE(QP_VALID(qp));
	REQUIRE(search_keylen < sizeof(dns_qpkey_t));

	n = get_root(qp);
	if (n == NULL) {
		return (ISC_R_NOTFOUND);
	}

	while (is_branch(n)) {
		prefetch_twigs(qp, n);
		bit = branch_keybit(n, search_key, search_keylen);
		if (!branch_has_twig(n, bit)) {
			return (ISC_R_NOTFOUND);
		}
		n = branch_twig_ptr(qp, n, bit);
	}

	found_keylen = leaf_qpkey(qp, n, found_key);
	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
	    QPKEY_EQUAL)
	{
		return (ISC_R_NOTFOUND);
	}

	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
	return (ISC_R_SUCCESS);
}

isc_result_t
dns_qp_getname(dns_qpreadable_t qpr, const dns_name_t *name, void **pval_r,
	       uint32_t *ival_r) {
	dns_qpkey_t key;
	size_t keylen = dns_qpkey_fromname(key, name);
	return (dns_qp_getkey(qpr, key, keylen, pval_r, ival_r));
}

static inline void
add_link(dns_qpchain_t *chain, dns_qpnode_t *node, size_t offset) {
	/* prevent duplication */
	if (chain->len != 0 && chain->chain[chain->len - 1].node == node) {
		return;
	}
	chain->chain[chain->len].node = node;
	chain->chain[chain->len].offset = offset;
	chain->len++;
	INSIST(chain->len <= DNS_NAME_MAXLABELS);
}

static inline void
prevleaf(dns_qpiter_t *it) {
	isc_result_t result = dns_qpiter_prev(it, NULL, NULL, NULL);
	if (result == ISC_R_NOMORE) {
		result = dns_qpiter_prev(it, NULL, NULL, NULL);
	}
	RUNTIME_CHECK(result == ISC_R_SUCCESS);
}

static inline dns_qpnode_t *
greatest_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpiter_t *iter) {
	while (is_branch(n)) {
		dns_qpref_t ref = branch_twigs_ref(n) + branch_twigs_size(n) -
				  1;
		iter->stack[++iter->sp] = n;
		n = ref_ptr(qpr, ref);
	}
	return (n);
}

isc_result_t
dns_qp_lookup(dns_qpreadable_t qpr, const dns_name_t *name,
	      dns_name_t *foundname, dns_qpiter_t *iter, dns_qpchain_t *chain,
	      void **pval_r, uint32_t *ival_r) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	dns_qpkey_t search, found;
	size_t searchlen, foundlen;
	size_t offset;
	dns_qpnode_t *n = NULL;
	dns_qpchain_t oc;
	dns_qpiter_t it;
	bool matched = true;
	bool getpred = true;

	REQUIRE(QP_VALID(qp));
	REQUIRE(foundname == NULL || ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));

	searchlen = dns_qpkey_fromname(search, name);

	if (chain == NULL) {
		chain = &oc;
	}
	if (iter == NULL) {
		iter = &it;
		getpred = false;
	}
	dns_qpchain_init(qp, chain);
	dns_qpiter_init(qp, iter);

	n = get_root(qp);
	if (n == NULL) {
		return (ISC_R_NOTFOUND);
	}
	iter->stack[0] = n;

	/*
	 * Like `dns_qp_insert()`, we must find a leaf. However, we don't make a
	 * second pass: instead, we keep track of any leaves with shorter keys
	 * that we discover along the way. (In general, qp-trie searches can be
	 * one-pass, by recording their traversal, or two-pass, for less stack
	 * memory usage.)
	 */
	while (is_branch(n)) {
		prefetch_twigs(qp, n);

		offset = branch_key_offset(n);
		dns_qpshift_t bit = qpkey_bit(search, searchlen, offset);
		dns_qpnode_t *twigs = branch_twigs(qp, n);

		/*
		 * A shorter key that can be a parent domain always has a
		 * leaf node at SHIFT_NOBYTE (indicating end of its key)
		 * where our search key has a normal character immediately
		 * after a label separator.
		 *
		 * Note 1: It is OK if `off - 1` underflows: it will
		 * become SIZE_MAX, which is greater than `searchlen`, so
		 * `qpkey_bit()` will return SHIFT_NOBYTE, which is what we
		 * want when `off == 0`.
		 *
		 * Note 2: If SHIFT_NOBYTE twig is present, it will always
		 * be in position 0, the first localtion in 'twigs'.
		 */
		if (bit != SHIFT_NOBYTE && branch_has_twig(n, SHIFT_NOBYTE) &&
		    qpkey_bit(search, searchlen, offset - 1) == SHIFT_NOBYTE &&
		    !is_branch(twigs))
		{
			add_link(chain, twigs, offset);
		}

		matched = branch_has_twig(n, bit);
		if (matched) {
			/*
			 * found a match: if it's a branch, we keep
			 * searching, and if it's a leaf, we drop out of
			 * the loop.
			 */
			n = branch_twig_ptr(qp, n, bit);
		} else if (getpred) {
			/*
			 * this branch is a dead end. however, the caller
			 * passed us an iterator, so we need to find the
			 * predecessor of the searched-for-name.
			 * first step: find out if we've overshot
			 * the search key; we do that by finding an
			 * arbitrary leaf to compare against.
			 */
			size_t to;
			dns_qpnode_t *least = n;
			while (is_branch(least)) {
				least = branch_twigs(qp, least);
			}
			foundlen = leaf_qpkey(qp, least, found);
			to = qpkey_compare(search, searchlen, found, foundlen);
			if (to == offset) {
				/*
				 * we're on the right branch, so find
				 * the best match.
				 */

				dns_qpweight_t pos = branch_twig_pos(n, bit);
				if (pos == 0) {
					/*
					 * every leaf in the branch is greater
					 * than the one we wanted; use the
					 * iterator to walk back to the
					 * predecessor.
					 */
					prevleaf(iter);
					n = iter->stack[iter->sp--];
				} else {
					/*
					 * the name we want would've been
					 * after some twig in this
					 * branch. point n to that twig,
					 * then walk down to the highest
					 * leaf in that subtree to get the
					 * predecessor.
					 */
					n = greatest_leaf(qp, twigs + pos - 1,
							  iter);
				}
			} else if (to <= searchlen && to <= foundlen &&
				   search[to] < found[to])
			{
				/*
				 * every leaf is greater than the one
				 * we wanted, so iterate back to the
				 * predecessor.
				 */
				prevleaf(iter);
				n = iter->stack[iter->sp--];
			} else {
				/*
				 * every leaf is less than the one we
				 * wanted, so get the highest.
				 */
				n = greatest_leaf(qp, n, iter);
			}
		} else {
			/*
			 * this branch is a dead end, and the predecessor
			 * doesn't matter. now we just need to find a leaf
			 * to end on so qpkey_leaf() will work below.
			 */
			if (chain->len > 0) {
				/* we saved an ancestor leaf: use that */
				n = chain->chain[chain->len - 1].node;
			} else {
				/* walk down to find the leftmost leaf */
				while (is_branch(twigs)) {
					twigs = branch_twigs(qp, twigs);
				}
				n = twigs;
			}
		}

		iter->stack[++iter->sp] = n;
	}

	/* do the keys differ, and if so, where? */
	foundlen = leaf_qpkey(qp, n, found);
	offset = qpkey_compare(search, searchlen, found, foundlen);

	/*
	 * if we've been passed an iterator, we want it to point
	 * at the matching name in the case of an exact match, or at
	 * the predecessor name for a non-exact match.
	 *
	 * if 'matched' is true, then the search ended at a leaf.
	 * if it was not an exact match, then we're now pointing
	 * at either the immediate predecessor or the immediate
	 * successor of the searched-for name; if successor, we can
	 * now use the qpiter stack we've constructed to step back to
	 * the predecessor. if we're pointed at the predecessor
	 * or it was an exact match, we don't need to do anything.
	 *
	 * if 'matched' is false, then the search failed at a branch
	 * node, and we would already have positioned the iterator
	 * at the predecessor.
	 */
	if (getpred && matched) {
		if (offset != QPKEY_EQUAL &&
		    (offset <= searchlen && offset <= foundlen &&
		     found[offset] > search[offset]))
		{
			prevleaf(iter);
		}
	}

	if (offset == QPKEY_EQUAL || offset == foundlen) {
		SET_IF_NOT_NULL(pval_r, leaf_pval(n));
		SET_IF_NOT_NULL(ival_r, leaf_ival(n));
		maybe_set_name(qp, n, foundname);
		add_link(chain, n, offset);
		if (offset == QPKEY_EQUAL) {
			return (ISC_R_SUCCESS);
		} else {
			return (DNS_R_PARTIALMATCH);
		}
	}

	/*
	 * the requested name was not found, but if an ancestor
	 * was, we can retrieve that from the chain.
	 */
	int len = chain->len;
	while (len-- > 0) {
		if (offset >= chain->chain[len].offset) {
			n = chain->chain[len].node;
			SET_IF_NOT_NULL(pval_r, leaf_pval(n));
			SET_IF_NOT_NULL(ival_r, leaf_ival(n));
			maybe_set_name(qp, n, foundname);
			return (DNS_R_PARTIALMATCH);
		} else {
			/*
			 * oops, during the search we found and added
			 * a leaf that's longer than the requested
			 * name; remove it from the chain.
			 */
			chain->len--;
		}
	}

	/* nothing was found at all */
	return (ISC_R_NOTFOUND);
}

/**********************************************************************/