From 069625cccbaf7ae0dc5c80f70c48f59a4af25b14 Mon Sep 17 00:00:00 2001
From: Evan Hunt <each@isc.org>
Date: Mon, 15 Nov 2021 21:59:37 -0800
Subject: [PATCH] fix intermittent resolver test error

the resolver test checks that the correct number of fetches have
been sent NS rrsets of a given size, but it formerly did so by
counting queries received by the authoritative server, which could
result in an off-by-one count if one of the queries had been resent
due to a timeout or a port number collision.

this commit changes the test to count fetches initiated by the
resolver, which should prevent the intermittent test failure, and
is the actual datum we were interested in anyway.
---
 bin/tests/system/resolver/clean.sh |  2 +-
 bin/tests/system/resolver/tests.sh | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/bin/tests/system/resolver/clean.sh b/bin/tests/system/resolver/clean.sh
index b25e8a6e8f..1d32bc122e 100644
--- a/bin/tests/system/resolver/clean.sh
+++ b/bin/tests/system/resolver/clean.sh
@@ -14,7 +14,7 @@
 #
 rm -f */named.conf
 rm -f */named.memstats
-rm -f */named.run
+rm -f */named.run */named.run.prev
 rm -f */ans.run
 rm -f */*.jdb
 rm -f dig.out dig.out.* dig.*.out.*
diff --git a/bin/tests/system/resolver/tests.sh b/bin/tests/system/resolver/tests.sh
index d8485f29ed..4f2cdc4907 100755
--- a/bin/tests/system/resolver/tests.sh
+++ b/bin/tests/system/resolver/tests.sh
@@ -256,10 +256,17 @@ status=`expr $status + $ret`
 
 n=`expr $n + 1`
 echo_i "check that the resolver limits the number of NS records it follows in a referral response ($n)"
-# ns5 is the recusor being tested.  ns4 holds the sourcens zone containing names with varying numbers of NS
-# records pointing to non-existent nameservers in the targetns zone on ns6.
+# ns5 is the recusor being tested.  ns4 holds the sourcens zone containing
+# names with varying numbers of NS records pointing to non-existent
+# nameservers in the targetns zone on ns6.
 ret=0
 $RNDCCMD 10.53.0.5 flush || ret=1   # Ensure cache is empty before doing this test
+count_fetches () {
+        actual=$(nextpartpeek ns5/named.run |
+                  grep " fetch: ns.fake${nscount}" | wc -l)
+        [ ${actual:-0} -eq ${expected} ] || return 1
+        return 0
+}
 for nscount in 1 2 3 4 5 6 7 8 9 10
 do
         # Verify number of NS records at source server
@@ -267,23 +274,16 @@ do
         sourcerecs=`grep NS dig.ns4.out.${nscount}.${n} | grep -v ';' | wc -l`
         test $sourcerecs -eq $nscount || ret=1
         test $sourcerecs -eq $nscount || echo_i "NS count incorrect for target${nscount}.sourcens"
+
         # Expected queries = 2 * number of NS records, up to a maximum of 10.
         expected=`expr 2 \* $nscount`
         if [ $expected -gt 10 ]; then expected=10; fi
-        # Work out the queries made by checking statistics on the target before and after the test
-        $RNDCCMD 10.53.0.6 stats || ret=1
-        initial_count=`awk '/responses sent/ {print $1}' ns6/named.stats`
-        mv ns6/named.stats ns6/named.stats.initial.${nscount}.${n}
+        # Count the number of logged fetches 
+        nextpart ns5/named.run > /dev/null
         $DIG $DIGOPTS @10.53.0.5 target${nscount}.sourcens A > dig.ns5.out.${nscount}.${n} || ret=1
-        $RNDCCMD 10.53.0.6 stats || ret=1
-        final_count=`awk '/responses sent/ {print $1}' ns6/named.stats`
-        mv ns6/named.stats ns6/named.stats.final.${nscount}.${n}
-        # Check number of queries during the test is as expected
-        actual=`expr $final_count - $initial_count`
-        if [ $actual -ne $expected ]; then
-                echo_i "query count error: $nscount NS records: expected queries $expected, actual $actual"
-                ret=1
-        fi
+        retry_quiet 5 count_fetches ns5/named.run $nscount $expected || {
+            echo_i "query count error: $nscount NS records: expected queries $expected, actual $actual"; ret=1;
+        }
 done
 if [ $ret != 0 ]; then echo_i "failed"; fi
 status=`expr $status + $ret`