From 101d829b02b50713c036d6989e64519d090ae796 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Mon, 29 May 2023 14:17:01 +0000 Subject: [PATCH 1/5] Fix fetchlimit system test issues 1. Fix the numbering. 2. Fix an artifacts rewriting issue. 3. Add missing checks of 'ret' after some checks. --- bin/tests/system/fetchlimit/tests.sh | 59 ++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/bin/tests/system/fetchlimit/tests.sh b/bin/tests/system/fetchlimit/tests.sh index 443556a71f..b6e69a2d47 100644 --- a/bin/tests/system/fetchlimit/tests.sh +++ b/bin/tests/system/fetchlimit/tests.sh @@ -37,9 +37,11 @@ stat() { return 0 } +n=0 status=0 -echo_i "checking recursing clients are dropped at the per-server limit" +n=$((n + 1)) +echo_i "checking recursing clients are dropped at the per-server limit ($n)" ret=0 # make the server lame and restart $RNDCCMD flush @@ -57,14 +59,19 @@ done if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "dumping ADB data" +n=$((n + 1)) +echo_i "dumping ADB data ($n)" +ret=0 info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info quota=$2 [ ${quota:-200} -lt 200 ] || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) -echo_i "checking servfail statistics" +n=$((n + 1)) +echo_i "checking servfail statistics ($n)" ret=0 rm -f ns3/named.stats $RNDCCMD stats @@ -80,7 +87,8 @@ fails=`grep 'queries resulted in SERVFAIL' ns3/named.stats | sed 's/\([0-9][0-9] if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "checking lame server recovery" +n=$((n + 1)) +echo_i "checking lame server recovery ($n)" ret=0 rm -f ans4/norespond for try in 1 2 3 4 5; do @@ -89,22 +97,35 @@ for try in 1 2 3 4 5; do [ $ret -eq 1 ] && break sleep 1 done +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) -echo_i "dumping ADB data" +n=$((n + 1)) +echo_i "dumping ADB data ($n)" +ret=0 info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info [ ${2:-${quota}} -lt $quota ] || ret=1 quota=$2 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) +n=$((n + 1)) +echo_i "checking lame server recovery (continued) ($n)" +ret=0 for try in 1 2 3 4 5 6 7 8 9 10; do burst c $try stat 0 20 || ret=1 [ $ret -eq 1 ] && break sleep 1 done +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) -echo_i "dumping ADB data" +n=$((n + 1)) +echo_i "dumping ADB data ($n)" +ret=0 info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info @@ -116,17 +137,18 @@ status=$((status+ret)) copy_setports ns3/named2.conf.in ns3/named.conf rndc_reconfig ns3 10.53.0.3 -echo_i "checking lame server clients are dropped at the per-domain limit" +n=$((n + 1)) +echo_i "checking lame server clients are dropped at the per-domain limit ($n)" ret=0 fail=0 success=0 touch ans4/norespond for try in 1 2 3 4 5; do burst b $try 300 - $DIGCMD a ${try}.example > dig.out.ns3.$try - grep "status: NOERROR" dig.out.ns3.$try > /dev/null 2>&1 && \ + $DIGCMD a ${try}.example > dig.out.ns3.$n.$try + grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ success=$((success+1)) - grep "status: SERVFAIL" dig.out.ns3.$try > /dev/null 2>&1 && \ + grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ fail=$(($fail+1)) stat 40 40 || ret=1 allowed=$($RNDCCMD fetchlimit | awk '/lamesub/ { print $6 }') @@ -138,7 +160,9 @@ echo_i "$success successful valid queries, $fail SERVFAIL" if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "checking drop statistics" +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 rm -f ns3/named.stats $RNDCCMD stats for try in 1 2 3 4 5; do @@ -156,7 +180,8 @@ status=$((status+ret)) copy_setports ns3/named3.conf.in ns3/named.conf rndc_reconfig ns3 10.53.0.3 -echo_i "checking lame server clients are dropped below the hard limit" +n=$((n + 1)) +echo_i "checking lame server clients are dropped below the hard limit ($n)" ret=0 fail=0 exceeded=0 @@ -164,11 +189,11 @@ success=0 touch ans4/norespond for try in 1 2 3 4 5; do burst b $try 400 - $DIGCMD +time=2 a ${try}.example > dig.out.ns3.$try + $DIGCMD +time=2 a ${try}.example > dig.out.ns3.$n.$try stat 1 400 || exceeded=$((exceeded + 1)) - grep "status: NOERROR" dig.out.ns3.$try > /dev/null 2>&1 && \ + grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ success=$((success+1)) - grep "status: SERVFAIL" dig.out.ns3.$try > /dev/null 2>&1 && \ + grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ fail=$(($fail+1)) sleep 1 done @@ -181,7 +206,9 @@ echo_i "clients count exceeded 400 on $exceeded trials (expected 0)" if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) -echo_i "checking drop statistics" +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 rm -f ns3/named.stats touch ns3/named.stats $RNDCCMD stats From 7ebd055c7849f190fdfba99a6e8697f80f917136 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Mon, 29 May 2023 17:34:13 +0000 Subject: [PATCH 2/5] Light refactoring of the fetchlimit system test Prepare the fetchlimit system test for adding a clients-per-query check. Change some functions and commands to accept a destination NS IP address instead of using the hardcoded 10.53.0.3. --- bin/tests/system/fetchlimit/tests.sh | 62 ++++++++++++++++------------ 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/bin/tests/system/fetchlimit/tests.sh b/bin/tests/system/fetchlimit/tests.sh index b6e69a2d47..b7e6201066 100644 --- a/bin/tests/system/fetchlimit/tests.sh +++ b/bin/tests/system/fetchlimit/tests.sh @@ -14,26 +14,36 @@ . ../conf.sh DIGCMD="$DIG @10.53.0.3 -p ${PORT} +tcp +tries=1 +time=1" -RNDCCMD="$RNDC -p ${CONTROLPORT} -s 10.53.0.3 -c ../common/rndc.conf" + +rndccmd() ( + "$RNDC" -c ../common/rndc.conf -p "${CONTROLPORT}" -s "$@" +) burst() { - num=${3:-20} + server=${1} + num=${4:-20} rm -f burst.input.$$ while [ $num -gt 0 ]; do num=$((num-1)) - echo "${num}${1}${2}.lamesub.example A" >> burst.input.$$ + if [ "${5}" == "dup" ]; then + # burst with duplicate queries + echo "${2}${3}.lamesub.example A" >> burst.input.$$ + else + # burst with unique queries + echo "${num}${2}${3}.lamesub.example A" >> burst.input.$$ + fi done - $PERL ../ditch.pl -p ${PORT} -s 10.53.0.3 burst.input.$$ + $PERL ../ditch.pl -p ${PORT} -s ${server} burst.input.$$ rm -f burst.input.$$ } stat() { - clients=`$RNDCCMD status | grep "recursive clients" | + clients=`rndccmd ${1} status | grep "recursive clients" | sed 's;.*: \([^/][^/]*\)/.*;\1;'` echo_i "clients: $clients" [ "$clients" = "" ] && return 1 - [ "$clients" -ge $1 ] || return 1 - [ "$clients" -le $2 ] || return 1 + [ "$clients" -ge $2 ] || return 1 + [ "$clients" -le $3 ] || return 1 return 0 } @@ -44,15 +54,15 @@ n=$((n + 1)) echo_i "checking recursing clients are dropped at the per-server limit ($n)" ret=0 # make the server lame and restart -$RNDCCMD flush +rndccmd 10.53.0.3 flush touch ans4/norespond for try in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do - burst a $try + burst 10.53.0.3 a $try # fetches-per-server is at 400, but at 20qps against a lame server, # we'll reach 200 at the tenth second, and the quota should have been # tuned to less than that by then. [ $try -le 5 ] && low=$((try*10)) - stat 20 200 || ret=1 + stat 10.53.0.3 20 200 || ret=1 [ $ret -eq 1 ] && break sleep 1 done @@ -62,7 +72,7 @@ status=$((status+ret)) n=$((n + 1)) echo_i "dumping ADB data ($n)" ret=0 -info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') +info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info quota=$2 @@ -74,7 +84,7 @@ n=$((n + 1)) echo_i "checking servfail statistics ($n)" ret=0 rm -f ns3/named.stats -$RNDCCMD stats +rndccmd 10.53.0.3 stats for try in 1 2 3 4 5; do [ -f ns3/named.stats ] && break sleep 1 @@ -90,10 +100,10 @@ status=$((status+ret)) n=$((n + 1)) echo_i "checking lame server recovery ($n)" ret=0 -rm -f ans4/norespond +test -f ans4/norespond && rm -f ans4/norespond for try in 1 2 3 4 5; do - burst b $try - stat 0 200 || ret=1 + burst 10.53.0.3 b $try + stat 10.53.0.3 0 200 || ret=1 [ $ret -eq 1 ] && break sleep 1 done @@ -103,7 +113,7 @@ status=$((status+ret)) n=$((n + 1)) echo_i "dumping ADB data ($n)" ret=0 -info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') +info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info [ ${2:-${quota}} -lt $quota ] || ret=1 @@ -115,8 +125,8 @@ n=$((n + 1)) echo_i "checking lame server recovery (continued) ($n)" ret=0 for try in 1 2 3 4 5 6 7 8 9 10; do - burst c $try - stat 0 20 || ret=1 + burst 10.53.0.3 c $try + stat 10.53.0.3 0 20 || ret=1 [ $ret -eq 1 ] && break sleep 1 done @@ -126,7 +136,7 @@ status=$((status+ret)) n=$((n + 1)) echo_i "dumping ADB data ($n)" ret=0 -info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') +info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/') echo_i $info set -- $info [ ${2:-${quota}} -gt $quota ] || ret=1 @@ -144,14 +154,14 @@ fail=0 success=0 touch ans4/norespond for try in 1 2 3 4 5; do - burst b $try 300 + burst 10.53.0.3 b $try 300 $DIGCMD a ${try}.example > dig.out.ns3.$n.$try grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ success=$((success+1)) grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ fail=$(($fail+1)) - stat 40 40 || ret=1 - allowed=$($RNDCCMD fetchlimit | awk '/lamesub/ { print $6 }') + stat 10.53.0.3 40 40 || ret=1 + allowed=$(rndccmd 10.53.0.3 fetchlimit | awk '/lamesub/ { print $6 }') [ "${allowed:-0}" -eq 40 ] || ret=1 [ $ret -eq 1 ] && break sleep 1 @@ -164,7 +174,7 @@ n=$((n + 1)) echo_i "checking drop statistics ($n)" ret=0 rm -f ns3/named.stats -$RNDCCMD stats +rndccmd 10.53.0.3 stats for try in 1 2 3 4 5; do [ -f ns3/named.stats ] && break sleep 1 @@ -188,9 +198,9 @@ exceeded=0 success=0 touch ans4/norespond for try in 1 2 3 4 5; do - burst b $try 400 + burst 10.53.0.3 b $try 400 $DIGCMD +time=2 a ${try}.example > dig.out.ns3.$n.$try - stat 1 400 || exceeded=$((exceeded + 1)) + stat 10.53.0.3 1 400 || exceeded=$((exceeded + 1)) grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ success=$((success+1)) grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \ @@ -211,7 +221,7 @@ echo_i "checking drop statistics ($n)" ret=0 rm -f ns3/named.stats touch ns3/named.stats -$RNDCCMD stats +rndccmd 10.53.0.3 stats wait_for_log 5 "queries dropped due to recursive client limit" ns3/named.stats || ret=1 drops=`grep 'queries dropped due to recursive client limit' ns3/named.stats | sed 's/\([0-9][0-9]*\) queries.*/\1/'` [ "${drops:-0}" -ne 0 ] || ret=1 From 3bb2babcd075df1da244e8c6f178e1cb99d3e123 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Mon, 29 May 2023 17:47:55 +0000 Subject: [PATCH 3/5] Add clients-per-query checks for the fetchlimit system test Check if clients-per-query quota works as expected with or without a positive stale-answer-client-timeout value and serve-stale answers enabled. --- bin/tests/system/fetchlimit/ans4/ans.pl | 4 + bin/tests/system/fetchlimit/clean.sh | 4 +- .../system/fetchlimit/ns5/named1.conf.in | 46 ++++++++++ .../system/fetchlimit/ns5/named2.conf.in | 49 ++++++++++ bin/tests/system/fetchlimit/ns5/root.hint | 14 +++ bin/tests/system/fetchlimit/setup.sh | 1 + bin/tests/system/fetchlimit/tests.sh | 91 ++++++++++++++++++- 7 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 bin/tests/system/fetchlimit/ns5/named1.conf.in create mode 100644 bin/tests/system/fetchlimit/ns5/named2.conf.in create mode 100644 bin/tests/system/fetchlimit/ns5/root.hint diff --git a/bin/tests/system/fetchlimit/ans4/ans.pl b/bin/tests/system/fetchlimit/ans4/ans.pl index 5a265c4725..f44cf8b7e9 100644 --- a/bin/tests/system/fetchlimit/ans4/ans.pl +++ b/bin/tests/system/fetchlimit/ans4/ans.pl @@ -78,6 +78,10 @@ for (;;) { } if ($donotrespond == 0) { + if (index($qname, "latency") == 0) { + # 50ms latency + select(undef, undef, undef, 0.05); + } $sock->send($packet->data); print "RESPONSE:\n"; $packet->print; diff --git a/bin/tests/system/fetchlimit/clean.sh b/bin/tests/system/fetchlimit/clean.sh index 935d91bba8..20bbd60e1d 100644 --- a/bin/tests/system/fetchlimit/clean.sh +++ b/bin/tests/system/fetchlimit/clean.sh @@ -11,9 +11,11 @@ # See the COPYRIGHT file distributed with this work for additional # information regarding copyright ownership. -rm -f */named.conf */named.memstats */ans.run */named.recursing */named.run +rm -f */named.conf */named.memstats */ans.run */named.recursing */named.run */named.run.prev rm -f ans4/norespond rm -f burst.input.* rm -f dig.out* +rm -f wait_for_message.* rm -f ns*/managed-keys.bind* rm -f ns3/named.stats ns3/named.stats.prev ns3/named_dump.db +rm -f ns5/named.stats ns5/named.stats.prev diff --git a/bin/tests/system/fetchlimit/ns5/named1.conf.in b/bin/tests/system/fetchlimit/ns5/named1.conf.in new file mode 100644 index 0000000000..d3c62c225c --- /dev/null +++ b/bin/tests/system/fetchlimit/ns5/named1.conf.in @@ -0,0 +1,46 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +options { + query-source address 10.53.0.5; + notify-source 10.53.0.5; + transfer-source 10.53.0.5; + port @PORT@; + directory "."; + pid-file "named.pid"; + listen-on { 10.53.0.5; }; + listen-on-v6 { none; }; + recursion yes; + dnssec-validation yes; + notify yes; + clients-per-query 5; + max-clients-per-query 10; +}; + +server 10.53.0.4 { + edns no; +}; + +key rndc_key { + secret "1234abcd8765"; + algorithm @DEFAULT_HMAC@; +}; + +controls { + inet 10.53.0.5 port @CONTROLPORT@ allow { any; } keys { rndc_key; }; +}; + +zone "." { + type hint; + file "root.hint"; +}; diff --git a/bin/tests/system/fetchlimit/ns5/named2.conf.in b/bin/tests/system/fetchlimit/ns5/named2.conf.in new file mode 100644 index 0000000000..cb33d30716 --- /dev/null +++ b/bin/tests/system/fetchlimit/ns5/named2.conf.in @@ -0,0 +1,49 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +options { + query-source address 10.53.0.5; + notify-source 10.53.0.5; + transfer-source 10.53.0.5; + port @PORT@; + directory "."; + pid-file "named.pid"; + listen-on { 10.53.0.5; }; + listen-on-v6 { none; }; + recursion yes; + dnssec-validation yes; + notify yes; + stale-answer-enable yes; + stale-cache-enable yes; + stale-answer-client-timeout 1800; + clients-per-query 5; + max-clients-per-query 10; +}; + +server 10.53.0.4 { + edns no; +}; + +key rndc_key { + secret "1234abcd8765"; + algorithm @DEFAULT_HMAC@; +}; + +controls { + inet 10.53.0.5 port @CONTROLPORT@ allow { any; } keys { rndc_key; }; +}; + +zone "." { + type hint; + file "root.hint"; +}; diff --git a/bin/tests/system/fetchlimit/ns5/root.hint b/bin/tests/system/fetchlimit/ns5/root.hint new file mode 100644 index 0000000000..e0f186c2f8 --- /dev/null +++ b/bin/tests/system/fetchlimit/ns5/root.hint @@ -0,0 +1,14 @@ +; Copyright (C) Internet Systems Consortium, Inc. ("ISC") +; +; SPDX-License-Identifier: MPL-2.0 +; +; This Source Code Form is subject to the terms of the Mozilla Public +; License, v. 2.0. If a copy of the MPL was not distributed with this +; file, you can obtain one at https://mozilla.org/MPL/2.0/. +; +; See the COPYRIGHT file distributed with this work for additional +; information regarding copyright ownership. + +$TTL 999999 +. IN NS a.root-servers.nil. +a.root-servers.nil. IN A 10.53.0.1 diff --git a/bin/tests/system/fetchlimit/setup.sh b/bin/tests/system/fetchlimit/setup.sh index d02972fccc..f98749bc75 100644 --- a/bin/tests/system/fetchlimit/setup.sh +++ b/bin/tests/system/fetchlimit/setup.sh @@ -16,3 +16,4 @@ copy_setports ns1/named.conf.in ns1/named.conf copy_setports ns2/named.conf.in ns2/named.conf copy_setports ns3/named1.conf.in ns3/named.conf +copy_setports ns5/named1.conf.in ns5/named.conf diff --git a/bin/tests/system/fetchlimit/tests.sh b/bin/tests/system/fetchlimit/tests.sh index b7e6201066..2938d57b1e 100644 --- a/bin/tests/system/fetchlimit/tests.sh +++ b/bin/tests/system/fetchlimit/tests.sh @@ -25,7 +25,7 @@ burst() { rm -f burst.input.$$ while [ $num -gt 0 ]; do num=$((num-1)) - if [ "${5}" == "dup" ]; then + if [ "${5}" = "dup" ]; then # burst with duplicate queries echo "${2}${3}.lamesub.example A" >> burst.input.$$ else @@ -47,6 +47,15 @@ stat() { return 0 } +_wait_for_message() ( + nextpartpeek "$1" > wait_for_message.$n + grep -F "$2" wait_for_message.$n >/dev/null +) + +wait_for_message() ( + retry_quiet 20 _wait_for_message "$@" +) + n=0 status=0 @@ -228,5 +237,85 @@ drops=`grep 'queries dropped due to recursive client limit' ns3/named.stats | se if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) +nextpart ns5/named.run >/dev/null + +n=$((n + 1)) +echo_i "checking clients are dropped at the clients-per-query limit ($n)" +ret=0 +test -f ans4/norespond && rm -f ans4/norespond +for try in 1 2 3 4 5; do + burst 10.53.0.5 latency $try 20 "dup" + sleep 1 +done +wait_for_message ns5/named.run "clients-per-query increased to 10" || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 +rm -f ns5/named.stats +rndccmd 10.53.0.5 stats +for try in 1 2 3 4 5; do + [ -f ns5/named.stats ] && break + sleep 1 +done +zspill=`grep 'spilled due to clients per query' ns5/named.stats | sed 's/ *\([0-9][0-9]*\) spilled.*/\1/'` +[ -z "$zspill" ] && zspill=0 +# ns5 configuration: +# clients-per-query 5 +# max-clients-per-query 10 +# expected spills: +# 15 (out of 20) spilled for the first burst, and 10 (out of 20) spilled for +# the next 4 bursts (because of auto-tuning): 15 + (4 * 10) == 55 +expected=55 +[ "$zspill" -eq "$expected" ] || ret=1 +echo_i "$zspill clients spilled (expected $expected)" +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +echo_i "stop ns5" +stop_server --use-rndc --port ${CONTROLPORT} ns5 +copy_setports ns5/named2.conf.in ns5/named.conf +echo_i "start ns5" +start_server --noclean --restart --port ${PORT} ns5 + +nextpart ns5/named.run >/dev/null + +n=$((n + 1)) +echo_i "checking clients are dropped at the clients-per-query limit with stale-answer-client-timeout ($n)" +ret=0 +test -f ans4/norespond && rm -f ans4/norespond +for try in 1 2 3 4 5; do + burst 10.53.0.5 latency $try 20 "dup" + sleep 1 +done +wait_for_message ns5/named.run "clients-per-query increased to 10" || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n + 1)) +echo_i "checking drop statistics ($n)" +ret=0 +rm -f ns5/named.stats +rndccmd 10.53.0.5 stats +for try in 1 2 3 4 5; do + [ -f ns5/named.stats ] && break + sleep 1 +done +zspill=`grep 'spilled due to clients per query' ns5/named.stats | sed 's/ *\([0-9][0-9]*\) spilled.*/\1/'` +[ -z "$zspill" ] && zspill=0 +# ns5 configuration: +# clients-per-query 5 +# max-clients-per-query 10 +# expected spills: +# 15 (out of 20) spilled for the first burst, and 10 (out of 20) spilled for +# the next 4 bursts (because of auto-tuning): 15 + (4 * 10) == 55 +expected=55 +[ "$zspill" -eq "$expected" ] || ret=1 +echo_i "$zspill clients spilled (expected $expected)" +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + echo_i "exit status: $status" [ $status -eq 0 ] || exit 1 From 2ae5c4a674ef2d43c145c8486becc92aa26fff58 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Sat, 27 May 2023 11:01:28 +0000 Subject: [PATCH 4/5] Fix a clients-per-query miscalculation bug The number of clients per query is calculated using the pending fetch responses in the list. The dns_resolver_createfetch() function includes every item in the list when deciding whether the limit is reached (i.e. fctx->spilled is true). Then, when the limit is reached, there is another calculation in fctx_sendevents(), when deciding whether it is needed to increase the limit, but this time the TRYSTALE responses are not included in the calculation (because of early break from the loop), and because of that the limit is never increased. A single client can have more than one associated response/event in the list (currently max. two), and calculating them as separate "clients" is unexpected. E.g. if 'stale-answer-enable' is enabled and 'stale-answer-client-timeout' is enabled and is larger than 0, then each client will have two events, which will effectively halve the clients-per-query limit. Fix the dns_resolver_createfetch() function to calculate only the regular FETCHDONE responses/events. Change the fctx_sendevents() function to also calculate only FETCHDONE responses/events. Currently, this second change doesn't have any impact, because the TRYSTALE events were already skipped, but having the same condition in both places will help prevent similar bugs in the future if a new type of response/event is ever added. --- lib/dns/resolver.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index ce0d7bc389..e7e3c3914d 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -1553,6 +1553,16 @@ fctx_sendevents(fetchctx_t *fctx, isc_result_t result) { for (resp = ISC_LIST_HEAD(fctx->resps); resp != NULL; resp = next) { next = ISC_LIST_NEXT(resp, link); ISC_LIST_UNLINK(fctx->resps, resp, link); + + /* + * Only the regular fetch events should be counted for the + * clients-per-query limit, in case if there are multiple events + * registered for a single client. + */ + if (resp->type == FETCHDONE) { + count++; + } + if (resp->type == TRYSTALE) { /* * Not applicable to TRYSTALE resps; this function is @@ -1586,7 +1596,6 @@ fctx_sendevents(fetchctx_t *fctx, isc_result_t result) { FCTXTRACE("post response event"); isc_async_run(resp->loop, resp->cb, resp); - count++; } UNLOCK(&fctx->lock); @@ -10391,7 +10400,16 @@ dns_resolver_createfetch(dns_resolver_t *res, const dns_name_t *name, result = DNS_R_DUPLICATE; goto unlock; } - count++; + + /* + * Only the regular fetch events should be + * counted for the clients-per-query limit, in + * case if there are multiple events registered + * for a single client. + */ + if (resp->type == FETCHDONE) { + count++; + } } } if (count >= spillatmin && spillatmin != 0) { From 466a7d9b5fa288b16f61bde77d8aadf9fae18b3d Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Sat, 27 May 2023 11:30:56 +0000 Subject: [PATCH 5/5] Add CHANGES and release notes for [GL #4074] --- CHANGES | 8 ++++++++ doc/notes/notes-current.rst | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 2e36700d07..6ffefc54e2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,11 @@ +6186. [bug] Fix a 'clients-per-query' miscalculation bug. When the + 'stale-answer-enable' options was enabled and the + 'stale-answer-client-timeout' option was enabled and + larger than 0, named was taking two places from the + 'clients-per-query' limit for each client and was + failing to gradually auto-tune its value, as configured. + [GL #4074] + 6185. [func] Add "ClientQuota" statistics channel counter, which indicates the number of the resolver's spilled queries due to reaching the clients per query quota. [GL !7978] diff --git a/doc/notes/notes-current.rst b/doc/notes/notes-current.rst index 1e3c2e00e9..b96f0ed854 100644 --- a/doc/notes/notes-current.rst +++ b/doc/notes/notes-current.rst @@ -54,7 +54,11 @@ Feature Changes Bug Fixes ~~~~~~~~~ -- None. +- When the :any:`stale-answer-enable` option was enabled and the + :any:`stale-answer-client-timeout` option was enabled and larger than 0, + ``named`` was taking two places from the :any:`clients-per-query` limit for + each client and was failing to gradually auto-tune its value, as configured. + This has been fixed. :gl:`#4074` - It could happen that after the :any:`stale-answer-client-timeout` duration, a delegation from cache was returned to the client. This has now been fixed.