2
0
mirror of https://gitlab.isc.org/isc-projects/bind9 synced 2025-08-30 05:57:52 +00:00

Merge branch '4074-fix-stale-answer-client-timeout-with-clients-per-query' into 'main'

Fix a clients-per-query miscalculation bug

Closes #4074

See merge request isc-projects/bind9!7977
This commit is contained in:
Arаm Sаrgsyаn 2023-06-01 09:21:10 +00:00
commit 1b0e7e7a50
10 changed files with 318 additions and 46 deletions

View File

@ -1,3 +1,11 @@
6186. [bug] Fix a 'clients-per-query' miscalculation bug. When the
'stale-answer-enable' options was enabled and the
'stale-answer-client-timeout' option was enabled and
larger than 0, named was taking two places from the
'clients-per-query' limit for each client and was
failing to gradually auto-tune its value, as configured.
[GL #4074]
6185. [func] Add "ClientQuota" statistics channel counter, which
indicates the number of the resolver's spilled queries
due to reaching the clients per query quota. [GL !7978]

View File

@ -78,6 +78,10 @@ for (;;) {
}
if ($donotrespond == 0) {
if (index($qname, "latency") == 0) {
# 50ms latency
select(undef, undef, undef, 0.05);
}
$sock->send($packet->data);
print "RESPONSE:\n";
$packet->print;

View File

@ -11,9 +11,11 @@
# See the COPYRIGHT file distributed with this work for additional
# information regarding copyright ownership.
rm -f */named.conf */named.memstats */ans.run */named.recursing */named.run
rm -f */named.conf */named.memstats */ans.run */named.recursing */named.run */named.run.prev
rm -f ans4/norespond
rm -f burst.input.*
rm -f dig.out*
rm -f wait_for_message.*
rm -f ns*/managed-keys.bind*
rm -f ns3/named.stats ns3/named.stats.prev ns3/named_dump.db
rm -f ns5/named.stats ns5/named.stats.prev

View File

@ -0,0 +1,46 @@
/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* SPDX-License-Identifier: MPL-2.0
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
options {
query-source address 10.53.0.5;
notify-source 10.53.0.5;
transfer-source 10.53.0.5;
port @PORT@;
directory ".";
pid-file "named.pid";
listen-on { 10.53.0.5; };
listen-on-v6 { none; };
recursion yes;
dnssec-validation yes;
notify yes;
clients-per-query 5;
max-clients-per-query 10;
};
server 10.53.0.4 {
edns no;
};
key rndc_key {
secret "1234abcd8765";
algorithm @DEFAULT_HMAC@;
};
controls {
inet 10.53.0.5 port @CONTROLPORT@ allow { any; } keys { rndc_key; };
};
zone "." {
type hint;
file "root.hint";
};

View File

@ -0,0 +1,49 @@
/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* SPDX-License-Identifier: MPL-2.0
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
options {
query-source address 10.53.0.5;
notify-source 10.53.0.5;
transfer-source 10.53.0.5;
port @PORT@;
directory ".";
pid-file "named.pid";
listen-on { 10.53.0.5; };
listen-on-v6 { none; };
recursion yes;
dnssec-validation yes;
notify yes;
stale-answer-enable yes;
stale-cache-enable yes;
stale-answer-client-timeout 1800;
clients-per-query 5;
max-clients-per-query 10;
};
server 10.53.0.4 {
edns no;
};
key rndc_key {
secret "1234abcd8765";
algorithm @DEFAULT_HMAC@;
};
controls {
inet 10.53.0.5 port @CONTROLPORT@ allow { any; } keys { rndc_key; };
};
zone "." {
type hint;
file "root.hint";
};

View File

@ -0,0 +1,14 @@
; Copyright (C) Internet Systems Consortium, Inc. ("ISC")
;
; SPDX-License-Identifier: MPL-2.0
;
; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, you can obtain one at https://mozilla.org/MPL/2.0/.
;
; See the COPYRIGHT file distributed with this work for additional
; information regarding copyright ownership.
$TTL 999999
. IN NS a.root-servers.nil.
a.root-servers.nil. IN A 10.53.0.1

View File

@ -16,3 +16,4 @@
copy_setports ns1/named.conf.in ns1/named.conf
copy_setports ns2/named.conf.in ns2/named.conf
copy_setports ns3/named1.conf.in ns3/named.conf
copy_setports ns5/named1.conf.in ns5/named.conf

View File

@ -14,60 +14,86 @@
. ../conf.sh
DIGCMD="$DIG @10.53.0.3 -p ${PORT} +tcp +tries=1 +time=1"
RNDCCMD="$RNDC -p ${CONTROLPORT} -s 10.53.0.3 -c ../common/rndc.conf"
rndccmd() (
"$RNDC" -c ../common/rndc.conf -p "${CONTROLPORT}" -s "$@"
)
burst() {
num=${3:-20}
server=${1}
num=${4:-20}
rm -f burst.input.$$
while [ $num -gt 0 ]; do
num=$((num-1))
echo "${num}${1}${2}.lamesub.example A" >> burst.input.$$
if [ "${5}" = "dup" ]; then
# burst with duplicate queries
echo "${2}${3}.lamesub.example A" >> burst.input.$$
else
# burst with unique queries
echo "${num}${2}${3}.lamesub.example A" >> burst.input.$$
fi
done
$PERL ../ditch.pl -p ${PORT} -s 10.53.0.3 burst.input.$$
$PERL ../ditch.pl -p ${PORT} -s ${server} burst.input.$$
rm -f burst.input.$$
}
stat() {
clients=`$RNDCCMD status | grep "recursive clients" |
clients=`rndccmd ${1} status | grep "recursive clients" |
sed 's;.*: \([^/][^/]*\)/.*;\1;'`
echo_i "clients: $clients"
[ "$clients" = "" ] && return 1
[ "$clients" -ge $1 ] || return 1
[ "$clients" -le $2 ] || return 1
[ "$clients" -ge $2 ] || return 1
[ "$clients" -le $3 ] || return 1
return 0
}
_wait_for_message() (
nextpartpeek "$1" > wait_for_message.$n
grep -F "$2" wait_for_message.$n >/dev/null
)
wait_for_message() (
retry_quiet 20 _wait_for_message "$@"
)
n=0
status=0
echo_i "checking recursing clients are dropped at the per-server limit"
n=$((n + 1))
echo_i "checking recursing clients are dropped at the per-server limit ($n)"
ret=0
# make the server lame and restart
$RNDCCMD flush
rndccmd 10.53.0.3 flush
touch ans4/norespond
for try in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do
burst a $try
burst 10.53.0.3 a $try
# fetches-per-server is at 400, but at 20qps against a lame server,
# we'll reach 200 at the tenth second, and the quota should have been
# tuned to less than that by then.
[ $try -le 5 ] && low=$((try*10))
stat 20 200 || ret=1
stat 10.53.0.3 20 200 || ret=1
[ $ret -eq 1 ] && break
sleep 1
done
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "dumping ADB data"
info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/')
n=$((n + 1))
echo_i "dumping ADB data ($n)"
ret=0
info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/')
echo_i $info
set -- $info
quota=$2
[ ${quota:-200} -lt 200 ] || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "checking servfail statistics"
n=$((n + 1))
echo_i "checking servfail statistics ($n)"
ret=0
rm -f ns3/named.stats
$RNDCCMD stats
rndccmd 10.53.0.3 stats
for try in 1 2 3 4 5; do
[ -f ns3/named.stats ] && break
sleep 1
@ -80,32 +106,46 @@ fails=`grep 'queries resulted in SERVFAIL' ns3/named.stats | sed 's/\([0-9][0-9]
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "checking lame server recovery"
n=$((n + 1))
echo_i "checking lame server recovery ($n)"
ret=0
rm -f ans4/norespond
test -f ans4/norespond && rm -f ans4/norespond
for try in 1 2 3 4 5; do
burst b $try
stat 0 200 || ret=1
burst 10.53.0.3 b $try
stat 10.53.0.3 0 200 || ret=1
[ $ret -eq 1 ] && break
sleep 1
done
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "dumping ADB data"
info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/')
n=$((n + 1))
echo_i "dumping ADB data ($n)"
ret=0
info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/')
echo_i $info
set -- $info
[ ${2:-${quota}} -lt $quota ] || ret=1
quota=$2
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
n=$((n + 1))
echo_i "checking lame server recovery (continued) ($n)"
ret=0
for try in 1 2 3 4 5 6 7 8 9 10; do
burst c $try
stat 0 20 || ret=1
burst 10.53.0.3 c $try
stat 10.53.0.3 0 20 || ret=1
[ $ret -eq 1 ] && break
sleep 1
done
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "dumping ADB data"
info=$($RNDCCMD fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/')
n=$((n + 1))
echo_i "dumping ADB data ($n)"
ret=0
info=$(rndccmd 10.53.0.3 fetchlimit | grep 10.53.0.4 | sed 's/.*quota .*(\([0-9]*\).*atr \([.0-9]*\).*/\2 \1/')
echo_i $info
set -- $info
[ ${2:-${quota}} -gt $quota ] || ret=1
@ -116,20 +156,21 @@ status=$((status+ret))
copy_setports ns3/named2.conf.in ns3/named.conf
rndc_reconfig ns3 10.53.0.3
echo_i "checking lame server clients are dropped at the per-domain limit"
n=$((n + 1))
echo_i "checking lame server clients are dropped at the per-domain limit ($n)"
ret=0
fail=0
success=0
touch ans4/norespond
for try in 1 2 3 4 5; do
burst b $try 300
$DIGCMD a ${try}.example > dig.out.ns3.$try
grep "status: NOERROR" dig.out.ns3.$try > /dev/null 2>&1 && \
burst 10.53.0.3 b $try 300
$DIGCMD a ${try}.example > dig.out.ns3.$n.$try
grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \
success=$((success+1))
grep "status: SERVFAIL" dig.out.ns3.$try > /dev/null 2>&1 && \
grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \
fail=$(($fail+1))
stat 40 40 || ret=1
allowed=$($RNDCCMD fetchlimit | awk '/lamesub/ { print $6 }')
stat 10.53.0.3 40 40 || ret=1
allowed=$(rndccmd 10.53.0.3 fetchlimit | awk '/lamesub/ { print $6 }')
[ "${allowed:-0}" -eq 40 ] || ret=1
[ $ret -eq 1 ] && break
sleep 1
@ -138,9 +179,11 @@ echo_i "$success successful valid queries, $fail SERVFAIL"
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "checking drop statistics"
n=$((n + 1))
echo_i "checking drop statistics ($n)"
ret=0
rm -f ns3/named.stats
$RNDCCMD stats
rndccmd 10.53.0.3 stats
for try in 1 2 3 4 5; do
[ -f ns3/named.stats ] && break
sleep 1
@ -156,19 +199,20 @@ status=$((status+ret))
copy_setports ns3/named3.conf.in ns3/named.conf
rndc_reconfig ns3 10.53.0.3
echo_i "checking lame server clients are dropped below the hard limit"
n=$((n + 1))
echo_i "checking lame server clients are dropped below the hard limit ($n)"
ret=0
fail=0
exceeded=0
success=0
touch ans4/norespond
for try in 1 2 3 4 5; do
burst b $try 400
$DIGCMD +time=2 a ${try}.example > dig.out.ns3.$try
stat 1 400 || exceeded=$((exceeded + 1))
grep "status: NOERROR" dig.out.ns3.$try > /dev/null 2>&1 && \
burst 10.53.0.3 b $try 400
$DIGCMD +time=2 a ${try}.example > dig.out.ns3.$n.$try
stat 10.53.0.3 1 400 || exceeded=$((exceeded + 1))
grep "status: NOERROR" dig.out.ns3.$n.$try > /dev/null 2>&1 && \
success=$((success+1))
grep "status: SERVFAIL" dig.out.ns3.$try > /dev/null 2>&1 && \
grep "status: SERVFAIL" dig.out.ns3.$n.$try > /dev/null 2>&1 && \
fail=$(($fail+1))
sleep 1
done
@ -181,15 +225,97 @@ echo_i "clients count exceeded 400 on $exceeded trials (expected 0)"
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "checking drop statistics"
n=$((n + 1))
echo_i "checking drop statistics ($n)"
ret=0
rm -f ns3/named.stats
touch ns3/named.stats
$RNDCCMD stats
rndccmd 10.53.0.3 stats
wait_for_log 5 "queries dropped due to recursive client limit" ns3/named.stats || ret=1
drops=`grep 'queries dropped due to recursive client limit' ns3/named.stats | sed 's/\([0-9][0-9]*\) queries.*/\1/'`
[ "${drops:-0}" -ne 0 ] || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
nextpart ns5/named.run >/dev/null
n=$((n + 1))
echo_i "checking clients are dropped at the clients-per-query limit ($n)"
ret=0
test -f ans4/norespond && rm -f ans4/norespond
for try in 1 2 3 4 5; do
burst 10.53.0.5 latency $try 20 "dup"
sleep 1
done
wait_for_message ns5/named.run "clients-per-query increased to 10" || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
n=$((n + 1))
echo_i "checking drop statistics ($n)"
ret=0
rm -f ns5/named.stats
rndccmd 10.53.0.5 stats
for try in 1 2 3 4 5; do
[ -f ns5/named.stats ] && break
sleep 1
done
zspill=`grep 'spilled due to clients per query' ns5/named.stats | sed 's/ *\([0-9][0-9]*\) spilled.*/\1/'`
[ -z "$zspill" ] && zspill=0
# ns5 configuration:
# clients-per-query 5
# max-clients-per-query 10
# expected spills:
# 15 (out of 20) spilled for the first burst, and 10 (out of 20) spilled for
# the next 4 bursts (because of auto-tuning): 15 + (4 * 10) == 55
expected=55
[ "$zspill" -eq "$expected" ] || ret=1
echo_i "$zspill clients spilled (expected $expected)"
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "stop ns5"
stop_server --use-rndc --port ${CONTROLPORT} ns5
copy_setports ns5/named2.conf.in ns5/named.conf
echo_i "start ns5"
start_server --noclean --restart --port ${PORT} ns5
nextpart ns5/named.run >/dev/null
n=$((n + 1))
echo_i "checking clients are dropped at the clients-per-query limit with stale-answer-client-timeout ($n)"
ret=0
test -f ans4/norespond && rm -f ans4/norespond
for try in 1 2 3 4 5; do
burst 10.53.0.5 latency $try 20 "dup"
sleep 1
done
wait_for_message ns5/named.run "clients-per-query increased to 10" || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
n=$((n + 1))
echo_i "checking drop statistics ($n)"
ret=0
rm -f ns5/named.stats
rndccmd 10.53.0.5 stats
for try in 1 2 3 4 5; do
[ -f ns5/named.stats ] && break
sleep 1
done
zspill=`grep 'spilled due to clients per query' ns5/named.stats | sed 's/ *\([0-9][0-9]*\) spilled.*/\1/'`
[ -z "$zspill" ] && zspill=0
# ns5 configuration:
# clients-per-query 5
# max-clients-per-query 10
# expected spills:
# 15 (out of 20) spilled for the first burst, and 10 (out of 20) spilled for
# the next 4 bursts (because of auto-tuning): 15 + (4 * 10) == 55
expected=55
[ "$zspill" -eq "$expected" ] || ret=1
echo_i "$zspill clients spilled (expected $expected)"
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "exit status: $status"
[ $status -eq 0 ] || exit 1

View File

@ -54,7 +54,11 @@ Feature Changes
Bug Fixes
~~~~~~~~~
- None.
- When the :any:`stale-answer-enable` option was enabled and the
:any:`stale-answer-client-timeout` option was enabled and larger than 0,
``named`` was taking two places from the :any:`clients-per-query` limit for
each client and was failing to gradually auto-tune its value, as configured.
This has been fixed. :gl:`#4074`
- It could happen that after the :any:`stale-answer-client-timeout` duration,
a delegation from cache was returned to the client. This has now been fixed.

View File

@ -1553,6 +1553,16 @@ fctx_sendevents(fetchctx_t *fctx, isc_result_t result) {
for (resp = ISC_LIST_HEAD(fctx->resps); resp != NULL; resp = next) {
next = ISC_LIST_NEXT(resp, link);
ISC_LIST_UNLINK(fctx->resps, resp, link);
/*
* Only the regular fetch events should be counted for the
* clients-per-query limit, in case if there are multiple events
* registered for a single client.
*/
if (resp->type == FETCHDONE) {
count++;
}
if (resp->type == TRYSTALE) {
/*
* Not applicable to TRYSTALE resps; this function is
@ -1586,7 +1596,6 @@ fctx_sendevents(fetchctx_t *fctx, isc_result_t result) {
FCTXTRACE("post response event");
isc_async_run(resp->loop, resp->cb, resp);
count++;
}
UNLOCK(&fctx->lock);
@ -10391,7 +10400,16 @@ dns_resolver_createfetch(dns_resolver_t *res, const dns_name_t *name,
result = DNS_R_DUPLICATE;
goto unlock;
}
count++;
/*
* Only the regular fetch events should be
* counted for the clients-per-query limit, in
* case if there are multiple events registered
* for a single client.
*/
if (resp->type == FETCHDONE) {
count++;
}
}
}
if (count >= spillatmin && spillatmin != 0) {