diff --git a/CHANGES b/CHANGES index 2e36700d07..6ffefc54e2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,11 @@ +6186. [bug] Fix a 'clients-per-query' miscalculation bug. When the + 'stale-answer-enable' options was enabled and the + 'stale-answer-client-timeout' option was enabled and + larger than 0, named was taking two places from the + 'clients-per-query' limit for each client and was + failing to gradually auto-tune its value, as configured. + [GL #4074] + 6185. [func] Add "ClientQuota" statistics channel counter, which indicates the number of the resolver's spilled queries due to reaching the clients per query quota. [GL !7978] diff --git a/bin/tests/system/fetchlimit/ans4/ans.pl b/bin/tests/system/fetchlimit/ans4/ans.pl index 5a265c4725..f44cf8b7e9 100644 --- a/bin/tests/system/fetchlimit/ans4/ans.pl +++ b/bin/tests/system/fetchlimit/ans4/ans.pl @@ -78,6 +78,10 @@ for (;;) { } if ($donotrespond == 0) { + if (index($qname, "latency") == 0) { + # 50ms latency + select(undef, undef, undef, 0.05); + } $sock->send($packet->data); print "RESPONSE:\n"; $packet->print; diff --git a/bin/tests/system/fetchlimit/clean.sh b/bin/tests/system/fetchlimit/clean.sh index 935d91bba8..20bbd60e1d 100644 --- a/bin/tests/system/fetchlimit/clean.sh +++ b/bin/tests/system/fetchlimit/clean.sh @@ -11,9 +11,11 @@ # See the COPYRIGHT file distributed with this work for additional # information regarding copyright ownership. -rm -f */named.conf */named.memstats */ans.run */named.recursing */named.run +rm -f */named.conf */named.memstats */ans.run */named.recursing */named.run */named.run.prev rm -f ans4/norespond rm -f burst.input.* rm -f dig.out* +rm -f wait_for_message.* rm -f ns*/managed-keys.bind* rm -f ns3/named.stats ns3/named.stats.prev ns3/named_dump.db +rm -f ns5/named.stats ns5/named.stats.prev diff --git a/bin/tests/system/fetchlimit/ns5/named1.conf.in b/bin/tests/system/fetchlimit/ns5/named1.conf.in new file mode 100644 index 0000000000..d3c62c225c --- /dev/null +++ b/bin/tests/system/fetchlimit/ns5/named1.conf.in @@ -0,0 +1,46 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +options { + query-source address 10.53.0.5; + notify-source 10.53.0.5; + transfer-source 10.53.0.5; + port @PORT@; + directory "."; + pid-file "named.pid"; + listen-on { 10.53.0.5; }; + listen-on-v6 { none; }; + recursion yes; + dnssec-validation yes; + notify yes; + clients-per-query 5; + max-clients-per-query 10; +}; + +server 10.53.0.4 { + edns no; +}; + +key rndc_key { + secret "1234abcd8765"; + algorithm @DEFAULT_HMAC@; +}; + +controls { + inet 10.53.0.5 port @CONTROLPORT@ allow { any; } keys { rndc_key; }; +}; + +zone "." { + type hint; + file "root.hint"; +}; diff --git a/bin/tests/system/fetchlimit/ns5/named2.conf.in b/bin/tests/system/fetchlimit/ns5/named2.conf.in new file mode 100644 index 0000000000..cb33d30716 --- /dev/null +++ b/bin/tests/system/fetchlimit/ns5/named2.conf.in @@ -0,0 +1,49 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +options { + query-source address 10.53.0.5; + notify-source 10.53.0.5; + transfer-source 10.53.0.5; + port @PORT@; + directory "."; + pid-file "named.pid"; + listen-on { 10.53.0.5; }; + listen-on-v6 { none; }; + recursion yes; + dnssec-validation yes; + notify yes; + stale-answer-enable yes; + stale-cache-enable yes; + stale-answer-client-timeout 1800; + clients-per-query 5; + max-clients-per-query 10; +}; + +server 10.53.0.4 { + edns no; +}; + +key rndc_key { + secret "1234abcd8765"; + algorithm @DEFAULT_HMAC@; +}; + +controls { + inet 10.53.0.5 port @CONTROLPORT@ allow { any; } keys { rndc_key; }; +}; + +zone "." { + type hint; + file "root.hint"; +}; diff --git a/bin/tests/system/fetchlimit/ns5/root.hint b/bin/tests/system/fetchlimit/ns5/root.hint new file mode 100644 index 0000000000..e0f186c2f8 --- /dev/null +++ b/bin/tests/system/fetchlimit/ns5/root.hint @@ -0,0 +1,14 @@ +; Copyright (C) Internet Systems Consortium, Inc. ("ISC") +; +; SPDX-License-Identifier: MPL-2.0 +; +; This Source Code Form is subject to the terms of the Mozilla Public +; License, v. 2.0. If a copy of the MPL was not distributed with this +; file, you can obtain one at https://mozilla.org/MPL/2.0/. +; +; See the COPYRIGHT file distributed with this work for additional +; information regarding copyright ownership. + +$TTL 999999 +. IN NS a.root-servers.nil. +a.root-servers.nil. IN A 10.53.0.1 diff --git a/bin/tests/system/fetchlimit/setup.sh b/bin/tests/system/fetchlimit/setup.sh index d02972fccc..f98749bc75 100644 --- a/bin/tests/system/fetchlimit/setup.sh +++ b/bin/tests/system/fetchlimit/setup.sh @@ -16,3 +16,4 @@ copy_setports ns1/named.conf.in ns1/named.conf copy_setports ns2/named.conf.in ns2/named.conf copy_setports ns3/named1.conf.in ns3/named.conf +copy_setports ns5/named1.conf.in ns5/named.conf diff --git a/bin/tests/system/fetchlimit/tests.sh b/bin/tests/system/fetchlimit/tests.sh index 443556a71f..2938d57b1e 100644 --- a/bin/tests/system/fetchlimit/tests.sh +++ b/bin/tests/system/fetchlimit/tests.sh @@ -14,60 +14,86 @@ . ../conf.sh DIGCMD="$DIG @10.53.0.3 -p ${PORT} +tcp +tries=1 +time=1" -RNDCCMD="$RNDC -p ${CONTROLPORT} -s 10.53.0.3 -c ../common/rndc.conf" + +rndccmd() ( + "$RNDC" -c ../common/rndc.conf -p "${CONTROLPORT}" -s "$@" +) burst() { - num=${3:-20} + server=${1} + num=${4:-20} rm -f burst.input.$$ while [ $num -gt 0 ]; do num=$((num-1)) - echo "${num}${1}${2}.lamesub.example A" >> burst.input.$$ + if [ "${5}" = "dup" ]; then + # burst with duplicate queries + echo "${2}${3}.lamesub.example A" >> burst.input.$$ + else + # burst with unique queries + echo "${num}${2}${3}.lamesub.example A" >> burst.input.$$ + fi done - $PERL ../ditch.pl -p ${PORT} -s 10.53.0.3 burst.input.$$ + $PERL ../ditch.pl -p ${PORT} -s ${server} burst.input.$$ rm -f burst.input.$$ } stat() { - clients=`$RNDCCMD status | grep "recursive clients" | + clients=`rndccmd ${1} status | grep "recursive clients" | sed 's;.*: \([^/][^/]*\)/.*;\1;'` echo_i "clients: $clients" [ "$clients" = "" ] && return 1 - [ "$clients" -ge $1 ] || return 1 - [ "$clients" -le $2 ] || return 1 + [ "$clients" -ge $2 ] || return 1 + [ "$clients" -le $3 ] || return 1 return 0 } +_wait_for_message() ( + nextpartpeek "$1" > wait_for_message.$n + grep -F "$2" wait_for_message.$n >/dev/null +) + +wait_for_message() ( + retry_quiet 20 _wait_for_message "$@" +) + +n=0 status=0 -echo_i "checking recursing clients are dropped at the per-server limit" +n=$((n + 1)) +echo_i "checking recursing clients are dropped at the per-server limit ($n)" ret=0 # make the server lame and restart -$RNDCCMD flush +rndccmd 10.53.0.3 flush touch ans4/norespond for try in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do - burst a $try + burst 10.53.0.3 a $try # fetches-per-server is at 400, but at 20qps against a lame server, # we'll reach 200 at the tenth second, and the quota should have been # tuned to less than that by then. [ $try -le 5 ] && low=$((try*10)) - stat 20 200 || ret=1 + stat 10.53.0.3 20 200 || ret=1 [ $ret -eq 1 ] && break sleep 1 done if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "dumping ADB data" -info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') +n=$((n + 1)) +echo_i "dumping ADB data ($n)" +ret=0 +info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info quota=$2 [ ${quota:-200} -lt 200 ] || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) -echo_i "checking servfail statistics" +n=$((n + 1)) +echo_i "checking servfail statistics ($n)" ret=0 rm -f ns3/named.stats -$RNDCCMD stats +rndccmd 10.53.0.3 stats for try in 1 2 3 4 5; do [ -f ns3/named.stats ] && break sleep 1 @@ -80,32 +106,46 @@ fails=`grep 'queries resulted in SERVFAIL' ns3/named.stats | sed 's/\([0-9][0-9] if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "checking lame server recovery" +n=$((n + 1)) +echo_i "checking lame server recovery ($n)" ret=0 -rm -f ans4/norespond +test -f ans4/norespond && rm -f ans4/norespond for try in 1 2 3 4 5; do - burst b $try - stat 0 200 || ret=1 + burst 10.53.0.3 b $try + stat 10.53.0.3 0 200 || ret=1 [ $ret -eq 1 ] && break sleep 1 done +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) -echo_i "dumping ADB data" -info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') +n=$((n + 1)) +echo_i "dumping ADB data ($n)" +ret=0 +info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info [ ${2:-${quota}} -lt $quota ] || ret=1 quota=$2 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) +n=$((n + 1)) +echo_i "checking lame server recovery (continued) ($n)" +ret=0 for try in 1 2 3 4 5 6 7 8 9 10; do - burst c $try - stat 0 20 || ret=1 + burst 10.53.0.3 c $try + stat 10.53.0.3 0 20 || ret=1 [ $ret -eq 1 ] && break sleep 1 done +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) -echo_i "dumping ADB data" -info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') +n=$((n + 1)) +echo_i "dumping ADB data ($n)" +ret=0 +info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info [ ${2:-${quota}} -gt $quota ] || ret=1 @@ -116,20 +156,21 @@ status=$((status+ret)) copy_setports ns3/named2.conf.in ns3/named.conf rndc_reconfig ns3 10.53.0.3 -echo_i "checking lame server clients are dropped at the per-domain limit" +n=$((n + 1)) +echo_i "checking lame server clients are dropped at the per-domain limit ($n)" ret=0 fail=0 success=0 touch ans4/norespond for try in 1 2 3 4 5; do - burst b $try 300 - $DIGCMD a ${try}.example > dig.out.ns3.$try - grep "status: NOERROR" dig.out.ns3.$try > /dev/null 2>&1 && \ + burst 10.53.0.3 b $try 300 + $DIGCMD a ${try}.example > dig.out.ns3.$n.$try + grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ success=$((success+1)) - grep "status: SERVFAIL" dig.out.ns3.$try > /dev/null 2>&1 && \ + grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ fail=$(($fail+1)) - stat 40 40 || ret=1 - allowed=$($RNDCCMD fetchlimit | awk '/lamesub/ { print $6 }') + stat 10.53.0.3 40 40 || ret=1 + allowed=$(rndccmd 10.53.0.3 fetchlimit | awk '/lamesub/ { print $6 }') [ "${allowed:-0}" -eq 40 ] || ret=1 [ $ret -eq 1 ] && break sleep 1 @@ -138,9 +179,11 @@ echo_i "$success successful valid queries, $fail SERVFAIL" if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "checking drop statistics" +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 rm -f ns3/named.stats -$RNDCCMD stats +rndccmd 10.53.0.3 stats for try in 1 2 3 4 5; do [ -f ns3/named.stats ] && break sleep 1 @@ -156,19 +199,20 @@ status=$((status+ret)) copy_setports ns3/named3.conf.in ns3/named.conf rndc_reconfig ns3 10.53.0.3 -echo_i "checking lame server clients are dropped below the hard limit" +n=$((n + 1)) +echo_i "checking lame server clients are dropped below the hard limit ($n)" ret=0 fail=0 exceeded=0 success=0 touch ans4/norespond for try in 1 2 3 4 5; do - burst b $try 400 - $DIGCMD +time=2 a ${try}.example > dig.out.ns3.$try - stat 1 400 || exceeded=$((exceeded + 1)) - grep "status: NOERROR" dig.out.ns3.$try > /dev/null 2>&1 && \ + burst 10.53.0.3 b $try 400 + $DIGCMD +time=2 a ${try}.example > dig.out.ns3.$n.$try + stat 10.53.0.3 1 400 || exceeded=$((exceeded + 1)) + grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ success=$((success+1)) - grep "status: SERVFAIL" dig.out.ns3.$try > /dev/null 2>&1 && \ + grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ fail=$(($fail+1)) sleep 1 done @@ -181,15 +225,97 @@ echo_i "clients count exceeded 400 on $exceeded trials (expected 0)" if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "checking drop statistics" +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 rm -f ns3/named.stats touch ns3/named.stats -$RNDCCMD stats +rndccmd 10.53.0.3 stats wait_for_log 5 "queries dropped due to recursive client limit" ns3/named.stats || ret=1 drops=`grep 'queries dropped due to recursive client limit' ns3/named.stats | sed 's/\([0-9][0-9]*\) queries.*/\1/'` [ "${drops:-0}" -ne 0 ] || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) +nextpart ns5/named.run >/dev/null + +n=$((n + 1)) +echo_i "checking clients are dropped at the clients-per-query limit ($n)" +ret=0 +test -f ans4/norespond && rm -f ans4/norespond +for try in 1 2 3 4 5; do + burst 10.53.0.5 latency $try 20 "dup" + sleep 1 +done +wait_for_message ns5/named.run "clients-per-query increased to 10" || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 +rm -f ns5/named.stats +rndccmd 10.53.0.5 stats +for try in 1 2 3 4 5; do + [ -f ns5/named.stats ] && break + sleep 1 +done +zspill=`grep 'spilled due to clients per query' ns5/named.stats | sed 's/ *\([0-9][0-9]*\) spilled.*/\1/'` +[ -z "$zspill" ] && zspill=0 +# ns5 configuration: +# clients-per-query 5 +# max-clients-per-query 10 +# expected spills: +# 15 (out of 20) spilled for the first burst, and 10 (out of 20) spilled for +# the next 4 bursts (because of auto-tuning): 15 + (4 * 10) == 55 +expected=55 +[ "$zspill" -eq "$expected" ] || ret=1 +echo_i "$zspill clients spilled (expected $expected)" +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +echo_i "stop ns5" +stop_server --use-rndc --port ${CONTROLPORT} ns5 +copy_setports ns5/named2.conf.in ns5/named.conf +echo_i "start ns5" +start_server --noclean --restart --port ${PORT} ns5 + +nextpart ns5/named.run >/dev/null + +n=$((n + 1)) +echo_i "checking clients are dropped at the clients-per-query limit with stale-answer-client-timeout ($n)" +ret=0 +test -f ans4/norespond && rm -f ans4/norespond +for try in 1 2 3 4 5; do + burst 10.53.0.5 latency $try 20 "dup" + sleep 1 +done +wait_for_message ns5/named.run "clients-per-query increased to 10" || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 +rm -f ns5/named.stats +rndccmd 10.53.0.5 stats +for try in 1 2 3 4 5; do + [ -f ns5/named.stats ] && break + sleep 1 +done +zspill=`grep 'spilled due to clients per query' ns5/named.stats | sed 's/ *\([0-9][0-9]*\) spilled.*/\1/'` +[ -z "$zspill" ] && zspill=0 +# ns5 configuration: +# clients-per-query 5 +# max-clients-per-query 10 +# expected spills: +# 15 (out of 20) spilled for the first burst, and 10 (out of 20) spilled for +# the next 4 bursts (because of auto-tuning): 15 + (4 * 10) == 55 +expected=55 +[ "$zspill" -eq "$expected" ] || ret=1 +echo_i "$zspill clients spilled (expected $expected)" +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + echo_i "exit status: $status" [ $status -eq 0 ] || exit 1 diff --git a/doc/notes/notes-current.rst b/doc/notes/notes-current.rst index 1e3c2e00e9..b96f0ed854 100644 --- a/doc/notes/notes-current.rst +++ b/doc/notes/notes-current.rst @@ -54,7 +54,11 @@ Feature Changes Bug Fixes ~~~~~~~~~ -- None. +- When the :any:`stale-answer-enable` option was enabled and the + :any:`stale-answer-client-timeout` option was enabled and larger than 0, + ``named`` was taking two places from the :any:`clients-per-query` limit for + each client and was failing to gradually auto-tune its value, as configured. + This has been fixed. :gl:`#4074` - It could happen that after the :any:`stale-answer-client-timeout` duration, a delegation from cache was returned to the client. This has now been fixed. diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index ce0d7bc389..e7e3c3914d 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -1553,6 +1553,16 @@ fctx_sendevents(fetchctx_t *fctx, isc_result_t result) { for (resp = ISC_LIST_HEAD(fctx->resps); resp != NULL; resp = next) { next = ISC_LIST_NEXT(resp, link); ISC_LIST_UNLINK(fctx->resps, resp, link); + + /* + * Only the regular fetch events should be counted for the + * clients-per-query limit, in case if there are multiple events + * registered for a single client. + */ + if (resp->type == FETCHDONE) { + count++; + } + if (resp->type == TRYSTALE) { /* * Not applicable to TRYSTALE resps; this function is @@ -1586,7 +1596,6 @@ fctx_sendevents(fetchctx_t *fctx, isc_result_t result) { FCTXTRACE("post response event"); isc_async_run(resp->loop, resp->cb, resp); - count++; } UNLOCK(&fctx->lock); @@ -10391,7 +10400,16 @@ dns_resolver_createfetch(dns_resolver_t *res, const dns_name_t *name, result = DNS_R_DUPLICATE; goto unlock; } - count++; + + /* + * Only the regular fetch events should be + * counted for the clients-per-query limit, in + * case if there are multiple events registered + * for a single client. + */ + if (resp->type == FETCHDONE) { + count++; + } } } if (count >= spillatmin && spillatmin != 0) {