From d939d2ecde5639d11acd6eac33a997b3e3c78b02 Mon Sep 17 00:00:00 2001
From: Matthijs Mekking <matthijs@isc.org>
Date: Fri, 2 Sep 2022 16:50:39 +0200
Subject: [PATCH] Only refresh RRset once

Don't attempt to resolve DNS responses for intermediate results. This
may create multiple refreshes and can cause a crash.

One scenario is where for the query there is a CNAME and canonical
answer in cache that are both stale. This will trigger a refresh of
the RRsets because we encountered stale data and we prioritized it over
the lookup. It will trigger a refresh of both RRsets. When we start
recursing, it will detect a recursion loop because the recursion
parameters will eventually be the same. In 'dns_resolver_destroyfetch'
the sanity check fails, one of the callers did not get its event back
before trying to destroy the fetch.

Move the call to 'query_refresh_rrset' to 'ns_query_done', so that it
is only called once per client request.

Another scenario is where for the query there is a stale CNAME in the
cache that points to a record that is also in cache but not stale. This
will trigger a refresh of the RRset (because we encountered stale data
and we prioritized it over the lookup).

We mark RRsets that we add to the message with
DNS_RDATASETATTR_STALE_ADDED to prevent adding a duplicate RRset when
a stale lookup and a normal lookup conflict with each other. However,
the other non-stale RRset when following a CNAME chain will be added to
the message without setting that attribute, because it is not stale.

This is a variant of the bug in #2594. The fix covered the same crash
but for stale-answer-client-timeout > 0.

Fix this by clearing all RRsets from the message before refreshing.
This requires the refresh to happen after the query is send back to
the client.
---
 lib/ns/include/ns/query.h |  1 +
 lib/ns/query.c            | 42 ++++++++++++++++++++++++---------------
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/lib/ns/include/ns/query.h b/lib/ns/include/ns/query.h
index 843de665ea..c1c7b5e430 100644
--- a/lib/ns/include/ns/query.h
+++ b/lib/ns/include/ns/query.h
@@ -203,6 +203,7 @@ struct query_ctx {
 	bool authoritative;		    /* authoritative query? */
 	bool want_restart;		    /* CNAME chain or other
 					     * restart needed */
+	bool		refresh_rrset;	    /* stale RRset refresh needed */
 	bool		need_wildcardproof; /* wildcard proof needed */
 	bool		nxrewrite;	    /* negative answer from RPZ */
 	bool		findcoveringnsec;   /* lookup covering NSEC */
diff --git a/lib/ns/query.c b/lib/ns/query.c
index beeae0e653..dd2d2f2e0a 100644
--- a/lib/ns/query.c
+++ b/lib/ns/query.c
@@ -5780,7 +5780,6 @@ query_lookup(query_ctx_t *qctx) {
 	bool dbfind_stale = false;
 	bool stale_timeout = false;
 	bool stale_found = false;
-	bool refresh_rrset = false;
 	bool stale_refresh_window = false;
 	uint16_t ede = 0;
 
@@ -5980,8 +5979,7 @@ query_lookup(query_ctx_t *qctx) {
 					"%s stale answer used, an attempt to "
 					"refresh the RRset will still be made",
 					namebuf);
-				refresh_rrset = STALE(qctx->rdataset);
-				qctx->client->nodetach = refresh_rrset;
+				qctx->refresh_rrset = STALE(qctx->rdataset);
 				ns_client_extendederror(
 					qctx->client, ede,
 					"stale data prioritized over lookup");
@@ -6025,17 +6023,6 @@ query_lookup(query_ctx_t *qctx) {
 
 	result = query_gotanswer(qctx, result);
 
-	if (refresh_rrset) {
-		/*
-		 * If we reached this point then it means that we have found a
-		 * stale RRset entry in cache and BIND is configured to allow
-		 * queries to be answered with stale data if no active RRset
-		 * is available, i.e. "stale-anwer-client-timeout 0". But, we
-		 * still need to refresh the RRset.
-		 */
-		query_refresh_rrset(qctx);
-	}
-
 cleanup:
 	return (result);
 }
@@ -7987,11 +7974,14 @@ query_addanswer(query_ctx_t *qctx) {
 
 	/*
 	 * On normal lookups, clear any rdatasets that were added on a
-	 * lookup due to stale-answer-client-timeout.
+	 * lookup due to stale-answer-client-timeout. Do not clear if we
+	 * are going to refresh the RRset, because the stale contents are
+	 * prioritized.
 	 */
 	if (QUERY_STALEOK(&qctx->client->query) &&
-	    !QUERY_STALETIMEOUT(&qctx->client->query))
+	    !QUERY_STALETIMEOUT(&qctx->client->query) && !qctx->refresh_rrset)
 	{
+		CCTRACE(ISC_LOG_DEBUG(3), "query_clear_stale");
 		query_clear_stale(qctx->client);
 		/*
 		 * We can clear the attribute to prevent redundant clearing
@@ -11523,9 +11513,29 @@ ns_query_done(query_ctx_t *qctx) {
 	/*
 	 * Client may have been detached after query_send(), so
 	 * we test and store the flag state here, for safety.
+	 * If we are refreshing the RRSet, we must not detach from the client
+	 * in the query_send(), so we need to override the flag.
 	 */
+	if (qctx->refresh_rrset) {
+		qctx->client->nodetach = true;
+	}
 	nodetach = qctx->client->nodetach;
 	query_send(qctx->client);
+
+	if (qctx->refresh_rrset) {
+		/*
+		 * If we reached this point then it means that we have found a
+		 * stale RRset entry in cache and BIND is configured to allow
+		 * queries to be answered with stale data if no active RRset
+		 * is available, i.e. "stale-anwer-client-timeout 0". But, we
+		 * still need to refresh the RRset. To prevent adding duplicate
+		 * RRsets, clear the RRsets from the message before doing the
+		 * refresh.
+		 */
+		message_clearrdataset(qctx->client->message, 0);
+		query_refresh_rrset(qctx);
+	}
+
 	if (!nodetach) {
 		qctx->detach_client = true;
 	}