From 87591de6f73590c743a053a21052f98e6762012c Mon Sep 17 00:00:00 2001 From: Matthijs Mekking Date: Wed, 10 Mar 2021 15:14:56 +0100 Subject: [PATCH] Fix servestale fetchlimits crash When we query the resolver for a domain name that is in the same zone for which is already one or more fetches outstanding, we could potentially hit the fetch limits. If so, recursion fails immediately for the incoming query and if serve-stale is enabled, we may try to return a stale answer. If the resolver is also is authoritative for the parent zone (for example the root zone), first a delegation is found, but we first check the cache for a better response. Nothing is found in the cache, so we try to recurse to find the answer to the query. Because of fetch-limits 'dns_resolver_createfetch()' returns an error, which 'ns_query_recurse()' propagates to the caller, 'query_delegation_recurse()'. Because serve-stale is enabled, 'query_usestale()' is called, setting 'qctx->db' to the cache db, but leaving 'qctx->version' untouched. Now 'query_lookup()' is called to search for stale data in the cache database with a non-NULL 'qctx->version' (which is set to a zone db version), and thus we hit an assertion in rbtdb. This crash was introduced in 'main' by commit 8bcd7fe69e5343071fc917738d6092a8b974ef3f. --- CHANGES | 6 ++ .../system/serve-stale/ns3/named7.conf.in | 55 +++++++++++++ bin/tests/system/serve-stale/tests.sh | 77 ++++++++++++++++++- lib/ns/query.c | 1 + 4 files changed, 136 insertions(+), 3 deletions(-) create mode 100644 bin/tests/system/serve-stale/ns3/named7.conf.in diff --git a/CHANGES b/CHANGES index c182d8b4b1..e369897266 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,9 @@ +5597. [bug] When serve-stale was enabled and starting the recursive + resolution process for a query failed, a named instance + could crash if it was configured as both a recursive and + authoritative server. This problem was introduced by + change 5573 and has now been fixed. [GL #2565] + 5596. [func] Client-side support for DNS-over-HTTPS (DoH) has been added to dig. "dig +https" can now query a server via HTTP/2. [GL #1641] diff --git a/bin/tests/system/serve-stale/ns3/named7.conf.in b/bin/tests/system/serve-stale/ns3/named7.conf.in new file mode 100644 index 0000000000..403a6fa9e2 --- /dev/null +++ b/bin/tests/system/serve-stale/ns3/named7.conf.in @@ -0,0 +1,55 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + + +/* + * Test serve-stale interaction with fetch-limits (dual-mode). + */ + +key rndc_key { + secret "1234abcd8765"; + algorithm hmac-sha256; +}; + +controls { + inet 10.53.0.3 port @CONTROLPORT@ allow { any; } keys { rndc_key; }; +}; + +options { + query-source address 10.53.0.3; + notify-source 10.53.0.3; + transfer-source 10.53.0.3; + port @PORT@; + pid-file "named.pid"; + listen-on { 10.53.0.3; }; + listen-on-v6 { none; }; + dnssec-validation no; + recursion yes; + /* + * stale-answer-enable is not strictly required because serving + * stale answers is enabled in the test via rndc. + */ + stale-answer-enable yes; + stale-cache-enable yes; + stale-answer-ttl 3; + stale-answer-client-timeout disabled; + stale-refresh-time 4; + resolver-query-timeout 10; + fetches-per-zone 1 fail; + fetches-per-server 1 fail; + max-stale-ttl 3600; +}; + +zone "." { + type secondary; + primaries { 10.53.0.1; }; + file "root.bk"; +}; diff --git a/bin/tests/system/serve-stale/tests.sh b/bin/tests/system/serve-stale/tests.sh index 8ca4cf1511..1d50abcae1 100755 --- a/bin/tests/system/serve-stale/tests.sh +++ b/bin/tests/system/serve-stale/tests.sh @@ -2032,9 +2032,9 @@ if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) #################################################################### -# Test if fetch-limits quota is reached, stale data is served. # -#################################################################### -echo_i "test stale data with fetch-limits" +# Test serve-stale's interaction with fetch limits (cache only) # +################################################################# +echo_i "test serve-stale's interaction with fetch-limits (cache only)" # We update the named configuration to enable fetch-limits. The fetch-limits # are set to 1, which is ridiciously low, but that is because for this test we @@ -2140,5 +2140,76 @@ grep "data\.example\..*3.*IN.*TXT.*A text record with a 2 second ttl" dig.out.te if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) +######################################################################## +# Test serve-stale's interaction with fetch limits (dual-mode) # +######################################################################## +echo_i "test serve-stale's interaction with fetch limits (dual-mode)" + +# Update named configuration so that ns3 becomes a recursive resolver which is +# also a secondary server for the root zone. +n=$((n+1)) +echo_i "updating ns3/named.conf ($n)" +ret=0 +copy_setports ns3/named7.conf.in ns3/named.conf +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "running 'rndc reload' ($n)" +ret=0 +rndc_reload ns3 10.53.0.3 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# Flush the cache to ensure the example/NS RRset cached during previous tests +# does not override the authoritative delegation found in the root zone. +n=$((n+1)) +echo_i "flush cache ($n)" +ret=0 +$RNDCCMD 10.53.0.3 flush > rndc.out.test$n 2>&1 || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# Query name server with low fetch limits. The authoritative server (ans2) is +# not responding. Sending queries for multiple names in the 'example' zone +# in parallel causes the fetch limit for that zone (set to 1) to be +# reached. This should not trigger a crash. +echo_i "sending queries for tests $((n+1))-$((n+4))..." +$DIG -p ${PORT} @10.53.0.3 data.example TXT > dig.out.test$((n+1)) & +$DIG -p ${PORT} @10.53.0.3 othertype.example CAA > dig.out.test$((n+2)) & +$DIG -p ${PORT} @10.53.0.3 nodata.example TXT > dig.out.test$((n+3)) & +$DIG -p ${PORT} @10.53.0.3 nxdomain.example TXT > dig.out.test$((n+4)) + +wait + +# Expect SERVFAIL for the entries not in cache. +n=$((n+1)) +echo_i "check stale data.example (fetch-limits dual-mode) ($n)" +ret=0 +grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "check stale othertype.example (fetch-limits dual-mode) ($n)" +ret=0 +grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "check stale nodata.example (fetch-limits dual-mode) ($n)" +ret=0 +grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "check stale nxdomain.example (fetch-limits dual-mode) ($n)" +ret=0 +grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + echo_i "exit status: $status" [ $status -eq 0 ] || exit 1 diff --git a/lib/ns/query.c b/lib/ns/query.c index 96150d29a8..da4943250c 100644 --- a/lib/ns/query.c +++ b/lib/ns/query.c @@ -7462,6 +7462,7 @@ query_usestale(query_ctx_t *qctx, isc_result_t result) { if (dns_view_staleanswerenabled(qctx->client->view)) { dns_db_attach(qctx->client->view->cachedb, &qctx->db); + qctx->version = NULL; qctx->client->query.dboptions |= DNS_DBFIND_STALEOK; if (qctx->client->query.fetch != NULL) { dns_resolver_destroyfetch(&qctx->client->query.fetch);