From 5f47c2b567b19bfedf23c9a66455ab265d48d06f Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 6 Jun 2024 09:16:57 +0000 Subject: [PATCH 1/4] Allow shorter resolver-query-timeout configuration There are use cases for which shorter timeout values make sense. For example if there is a load balancer which sets RD=1 and forwards queries to a BIND resolver which is then configured to talk to backend servers which are not visible in the public NS set. WIth a shorter timeout value the frontend can give back SERVFAIL early when backends are not available and the ultimate client will not penalize the BIND-frontend for non-response. --- lib/dns/resolver.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index 29c4b9b3af..382a4e98aa 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -173,11 +173,6 @@ #define MAX_SINGLE_QUERY_TIMEOUT 9000U #define MAX_SINGLE_QUERY_TIMEOUT_US (MAX_SINGLE_QUERY_TIMEOUT * US_PER_MS) -/* - * We need to allow a individual query time to complete / timeout. - */ -#define MINIMUM_QUERY_TIMEOUT (MAX_SINGLE_QUERY_TIMEOUT + 1000U) - /* * The default maximum number of validations and validation failures per-fetch */ @@ -188,9 +183,24 @@ #define DEFAULT_MAX_VALIDATION_FAILURES 1 #endif -/* The default time in seconds for the whole query to live. */ +/* + * A minumum sane timeout value for the whole query to live when e.g. talking to + * a backend server and a quick timeout is preferred by the user. + * + * IMPORTANT: if changing this value, note there is a documented behavior when + * values of 'resolver-query-timeout' less than or equal to 300 are treated as + * seconds and converted to milliseconds before applying the limits, that's + * why the value of 301 was chosen as the absolute minimum in order to not break + * backward compatibility. + */ +#define MINIMUM_QUERY_TIMEOUT 301U + +/* + * The default time in seconds for the whole query to live. + * We want to allow an individual query time to complete / timeout. + */ #ifndef DEFAULT_QUERY_TIMEOUT -#define DEFAULT_QUERY_TIMEOUT MINIMUM_QUERY_TIMEOUT +#define DEFAULT_QUERY_TIMEOUT (MAX_SINGLE_QUERY_TIMEOUT + 1000U) #endif /* ifndef DEFAULT_QUERY_TIMEOUT */ /* The maximum time in seconds for the whole query to live. */ @@ -1201,6 +1211,9 @@ fctx_cancelquery(resquery_t **queryp, isc_time_t *finish, bool no_response, if (rtt > MAX_SINGLE_QUERY_TIMEOUT_US) { rtt = MAX_SINGLE_QUERY_TIMEOUT_US; } + if (rtt > fctx->res->query_timeout * US_PER_MS) { + rtt = fctx->res->query_timeout * US_PER_MS; + } /* * Replace the current RTT with our value. @@ -1861,6 +1874,9 @@ fctx_setretryinterval(fetchctx_t *fctx, unsigned int rtt) { if (us > MAX_SINGLE_QUERY_TIMEOUT_US) { us = MAX_SINGLE_QUERY_TIMEOUT_US; } + if (us > fctx->res->query_timeout * US_PER_MS) { + us = fctx->res->query_timeout * US_PER_MS; + } seconds = us / US_PER_SEC; us -= seconds * US_PER_SEC; @@ -10861,7 +10877,7 @@ void dns_resolver_settimeout(dns_resolver_t *resolver, unsigned int timeout) { REQUIRE(VALID_RESOLVER(resolver)); - if (timeout <= 300) { + if (timeout < MINIMUM_QUERY_TIMEOUT) { timeout *= 1000; } From d6a79cce534a2ee88c659c757c30771c8103c901 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 6 Jun 2024 09:20:44 +0000 Subject: [PATCH 2/4] Test shorter resolver-query-timeout configuration Add two new checks which test the shorter than usual resolver-query-timeout configuration. --- bin/tests/system/resolver/ans3/ans.pl | 12 +++++++++-- bin/tests/system/resolver/ns1/named.conf.in | 1 + bin/tests/system/resolver/tests.sh | 22 +++++++++++++++++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/bin/tests/system/resolver/ans3/ans.pl b/bin/tests/system/resolver/ans3/ans.pl index 85d46cd4eb..02a8c1d6a6 100644 --- a/bin/tests/system/resolver/ans3/ans.pl +++ b/bin/tests/system/resolver/ans3/ans.pl @@ -74,6 +74,10 @@ sub handleQuery { $packet->push("answer", new Net::DNS::RR($qname . " 300 A 10.53.0.3")); } elsif ($qname eq "nodata.example.net") { # Do not add a SOA RRset. + } elsif ($qname eq "noresponse.example.net") { + # Do not response. + print "RESPONSE:\n"; + return ""; } elsif ($qname eq "nxdomain.example.net") { # Do not add a SOA RRset. $packet->header->rcode(NXDOMAIN); @@ -217,8 +221,12 @@ for (;;) { print "TCP request\n"; my $result = handleQuery($buf); $len = length($result); - $conn->syswrite(pack("n", $len), 2); - $n = $conn->syswrite($result, $len); + if ($len != 0) { + $conn->syswrite(pack("n", $len), 2); + $n = $conn->syswrite($result, $len); + } else { + $n = 0; + } print " Sent: $n chars via TCP\n"; } $conn->close; diff --git a/bin/tests/system/resolver/ns1/named.conf.in b/bin/tests/system/resolver/ns1/named.conf.in index fbcb6c1dd3..d212869fa3 100644 --- a/bin/tests/system/resolver/ns1/named.conf.in +++ b/bin/tests/system/resolver/ns1/named.conf.in @@ -28,6 +28,7 @@ options { "gooddname.example.net"; }; allow-query {!10.53.0.8; any; }; max-zone-ttl unlimited; + resolver-query-timeout 5000; # 5 seconds attach-cache "globalcache"; }; diff --git a/bin/tests/system/resolver/tests.sh b/bin/tests/system/resolver/tests.sh index a728806fb3..0e6db14457 100755 --- a/bin/tests/system/resolver/tests.sh +++ b/bin/tests/system/resolver/tests.sh @@ -43,6 +43,28 @@ grep "status: NOERROR" dig.out.ns1.test${n} >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) +# 'resolver-query-timeout' is set to 5 seconds in ns1, so dig with a lower +# timeout value should give up earlier than that. +n=$((n + 1)) +echo_i "checking no response handling with a shorter than resolver-query-timeout timeout ($n)" +ret=0 +dig_with_opts +tcp +tries=1 +timeout=3 noresponse.example.net @10.53.0.1 a >dig.out.ns1.test${n} && ret=1 +grep -F "no servers could be reached" dig.out.ns1.test${n} >/dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status + ret)) + +# 'resolver-query-timeout' is set to 5 seconds in ns1, which is lower than the +# current single query timeout value MAX_SINGLE_QUERY_TIMEOUT of 9 seconds, so +# the "hung fetch" timer should kick in, interrupt the non-responsive query and +# send a SERVFAIL answer. +n=$((n + 1)) +echo_i "checking no response handling with a longer than resolver-query-timeout timeout ($n)" +ret=0 +dig_with_opts +tcp +tries=1 +timeout=7 noresponse.example.net @10.53.0.1 a >dig.out.ns1.test${n} || ret=1 +grep -F "status: SERVFAIL" dig.out.ns1.test${n} >/dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status + ret)) + n=$((n + 1)) echo_i "checking handling of bogus referrals ($n)" # If the server has the "INSIST(!external)" bug, this query will kill it. From 621149c50a9cea0155cac3d7b78abaad7671a9b6 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 6 Jun 2024 09:22:55 +0000 Subject: [PATCH 3/4] Document shorter resolver-query-timeout configuration The lower limit is now 301 milliseconds instead of 10000 milliseconds. --- doc/arm/reference.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/arm/reference.rst b/doc/arm/reference.rst index 6abb4512a3..9dfda0f6fc 100644 --- a/doc/arm/reference.rst +++ b/doc/arm/reference.rst @@ -3132,8 +3132,8 @@ for details on how to specify IP address lists. This is the amount of time in milliseconds that the resolver spends attempting to resolve a recursive query before failing. The default - and minimum is ``10000`` and the maximum is ``30000``. Setting it to - ``0`` results in the default being used. + is ``10000``, the minimum is ``301`` and the maximum is ``30000``. + Setting it to ``0`` results in the default being used. This value was originally specified in seconds. Values less than or equal to 300 are treated as seconds and converted to From 63b787effe3171c87a526c60ab9766a1e6a52bf1 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 6 Jun 2024 19:58:57 +0000 Subject: [PATCH 4/4] Update the resolver unit test Before there was a gap from 301 to 9999 which would be converted to 10000 and now there is no such gap. This settimeout_belowmin test was checking the behavior of a value in the gap. As there is now no gap left, the minimum is 301 and anything below that is converted to seconds as before. In order for this check to still test the "below minimum" behavior, change the value from 9000 to 300. Update the settimeout_overmax value test too so it logically aligns with the minimum value test. --- tests/dns/resolver_test.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/dns/resolver_test.c b/tests/dns/resolver_test.c index 3f53f9f351..f53cfd684a 100644 --- a/tests/dns/resolver_test.c +++ b/tests/dns/resolver_test.c @@ -158,10 +158,10 @@ ISC_LOOP_TEST_IMPL(settimeout_belowmin) { mkres(&resolver); default_timeout = dns_resolver_gettimeout(resolver); - dns_resolver_settimeout(resolver, 9000); + dns_resolver_settimeout(resolver, 300); timeout = dns_resolver_gettimeout(resolver); - assert_int_equal(timeout, default_timeout); + assert_in_range(timeout, default_timeout, 3999999); destroy_resolver(&resolver); isc_loopmgr_shutdown(loopmgr); @@ -170,13 +170,16 @@ ISC_LOOP_TEST_IMPL(settimeout_belowmin) { /* dns_resolver_settimeout over maximum */ ISC_LOOP_TEST_IMPL(settimeout_overmax) { dns_resolver_t *resolver = NULL; - unsigned int timeout; + unsigned int default_timeout, timeout; mkres(&resolver); + default_timeout = dns_resolver_gettimeout(resolver); dns_resolver_settimeout(resolver, 4000000); + timeout = dns_resolver_gettimeout(resolver); - assert_in_range(timeout, 0, 3999999); + assert_in_range(timeout, default_timeout, 3999999); + destroy_resolver(&resolver); isc_loopmgr_shutdown(loopmgr); }