From 0fb4fc1897c4098df08b10e180db843be172d42d Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Sun, 13 Mar 2022 13:47:16 +0000 Subject: [PATCH 1/3] Fix dig error when trying the next server after a TCP connection failure When encountering a TCP connection error while trying to initiate a connection to a server, dig erroneously cancels the lookup even when there are other server(s) to try, which results in an assertion failure. Cancel the lookup only when there are no more queries left in the lookup's queries list (i.e. `next` is NULL). --- bin/dig/dighost.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/dig/dighost.c b/bin/dig/dighost.c index 88d355f100..fa813901c7 100644 --- a/bin/dig/dighost.c +++ b/bin/dig/dighost.c @@ -3293,8 +3293,10 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { next = NULL; } - cancel_lookup(l); query_detach(&query); + if (next == NULL) { + cancel_lookup(l); + } lookup_detach(&l); if (next != NULL) { From 03697f1bccd0c4b3f57f1756be9701023ea3a698 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Sun, 13 Mar 2022 13:47:44 +0000 Subject: [PATCH 2/3] Add various dig/host tests for TCP/UDP socket error handling cases Rework the "ans8" server in the "digdelv" system test to support various modes of operations using a control channel. The supported modes are: 1. `silent` (do not respond) 2. `close` (UDP: same as `silent`; TCP: also close the connection) 3. `servfail` (always respond with `SERVFAIL`) 4. `unstable` (constantly switch between `silent` and `servfail`) Add multiple tests to check the handling of both TCP and UDP socket error scenarios in dig/host. --- bin/tests/system/digdelv/ans8/ans.py | 104 ++++++++++++++++++++------- bin/tests/system/digdelv/tests.sh | 51 +++++++++++-- 2 files changed, 123 insertions(+), 32 deletions(-) diff --git a/bin/tests/system/digdelv/ans8/ans.py b/bin/tests/system/digdelv/ans8/ans.py index 18fbdece3e..333b2f6044 100644 --- a/bin/tests/system/digdelv/ans8/ans.py +++ b/bin/tests/system/digdelv/ans8/ans.py @@ -20,7 +20,27 @@ import struct import dns, dns.message from dns.rcode import * -def create_response(msg): +modes = [ + b"silent", # Do not respond + b"close", # UDP: same as silent; TCP: also close the connection + b"servfail", # Always respond with SERVFAIL + b"unstable", # Constantly switch between "silent" and "servfail" +] +mode = modes[0] +n = 0 + +def ctrl_channel(msg): + global modes, mode, n + + msg = msg.splitlines().pop(0) + print("Received control message: %s" % msg) + + if msg in modes: + mode = msg + n = 0 + print("New mode: %s" % str(mode)) + +def create_servfail(msg): m = dns.message.from_wire(msg) qname = m.question[0].name.to_text() rrtype = m.question[0].rdtype @@ -45,6 +65,9 @@ ip4 = "10.53.0.8" try: port=int(os.environ["PORT"]) except: port=5300 +try: ctrlport=int(os.environ['EXTRAPORT1']) +except: ctrlport=5300 + query4_udp = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) query4_udp.bind((ip4, port)) @@ -53,6 +76,11 @@ query4_tcp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) query4_tcp.bind((ip4, port)) query4_tcp.listen(100) +ctrl4_tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +ctrl4_tcp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +ctrl4_tcp.bind((ip4, ctrlport)) +ctrl4_tcp.listen(100) + signal.signal(signal.SIGTERM, sigterm) f = open("ans.pid", "w") @@ -63,12 +91,11 @@ f.close() running = True print ("Listening on %s port %d" % (ip4, port)) +print ("Listening on %s port %d" % (ip4, ctrlport)) print ("Ctrl-c to quit") -input = [query4_udp, query4_tcp] +input = [query4_udp, query4_tcp, ctrl4_tcp] -n_udp = 0 -n_tcp = 0 hung_conns = [] while running: @@ -83,54 +110,79 @@ while running: for s in inputready: if s == query4_udp: + n = n + 1 print("UDP query received on %s" % ip4, end=" ") - n_udp = n_udp + 1 msg = s.recvfrom(65535) - # Do not response to every other query. - if n_udp % 2 == 1: - print("NO RESPONSE") + if mode == b"silent" or mode == b"close" or (mode == b"unstable" and n % 2 == 1): + # Do not respond. + print("NO RESPONSE (%s)" % str(mode)) continue - rsp = create_response(msg[0]) - if rsp: - print(dns.rcode.to_text(rsp.rcode())) - s.sendto(rsp.to_wire(), msg[1]) + elif mode == b"servfail" or (mode == b"unstable" and n % 2 == 0): + rsp = create_servfail(msg[0]) + if rsp: + print(dns.rcode.to_text(rsp.rcode())) + s.sendto(rsp.to_wire(), msg[1]) + else: + print("NO RESPONSE (can not create a response)") else: - print("NO RESPONSE") + raise(Exception("unsupported mode: %s" % mode)) elif s == query4_tcp: + n = n + 1 print("TCP query received on %s" % ip4, end=" ") - n_tcp = n_tcp + 1 conn = None try: - conn, addr = s.accept() - # Do not response to every other query, hang the connection. - if n_tcp % 2 == 1: - print("NO RESPONSE") + if mode == b"silent" or (mode == b"unstable" and n % 2 == 1): + conn, addr = s.accept() + # Do not respond and hang the connection. + print("NO RESPONSE (%s)" % str(mode)) hung_conns.append(conn) - conn = None continue - else: + elif mode == b"close": + conn, addr = s.accept() + # Do not respond and close the connection. + print("NO RESPONSE (%s)" % str(mode)) + conn.close() + continue + elif mode == b"servfail" or (mode == b"unstable" and n % 2 == 0): + conn, addr = s.accept() # get TCP message length msg = conn.recv(2) if len(msg) != 2: - print("NO RESPONSE") + print("NO RESPONSE (can not read the message length)") conn.close() continue length = struct.unpack('>H', msg[:2])[0] msg = conn.recv(length) if len(msg) != length: - print("NO RESPONSE") + print("NO RESPONSE (can not read the message)") conn.close() continue - rsp = create_response(msg) + rsp = create_servfail(msg) if rsp: print(dns.rcode.to_text(rsp.rcode())) wire = rsp.to_wire() conn.send(struct.pack('>H', len(wire))) conn.send(wire) else: - print("NO RESPONSE") - except: - print("NO RESPONSE") + print("NO RESPONSE (can not create a response)") + else: + raise(Exception("unsupported mode: %s" % mode)) + except socket.error as e: + print("NO RESPONSE (error: %s)" % str(e)) + if conn: + conn.close() + elif s == ctrl4_tcp: + print("Control channel connected") + conn = None + try: + # Handle control channel input + conn, addr = s.accept() + msg = conn.recv(1024) + if msg: + ctrl_channel(msg) + conn.close() + except s.timeout: + pass if conn: conn.close() diff --git a/bin/tests/system/digdelv/tests.sh b/bin/tests/system/digdelv/tests.sh index 7e34527288..2e735c66ea 100644 --- a/bin/tests/system/digdelv/tests.sh +++ b/bin/tests/system/digdelv/tests.sh @@ -1001,6 +1001,8 @@ if [ -x "$DIG" ] ; then # See [GL #3020] for more information n=$((n+1)) echo_i "check that dig handles UDP timeout followed by a SERVFAIL correctly ($n)" + # Ask ans8 to be in "unstable" mode (switching between "silent" and "servfail" modes) + echo "unstable" | sendcmd 10.53.0.8 ret=0 dig_with_opts +timeout=1 +nofail @10.53.0.8 a.example > dig.out.test$n 2>&1 || ret=1 grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 @@ -1009,6 +1011,8 @@ if [ -x "$DIG" ] ; then n=$((n+1)) echo_i "check that dig handles TCP timeout followed by a SERVFAIL correctly ($n)" + # Ask ans8 to be in "unstable" mode (switching between "silent" and "servfail" modes) + echo "unstable" | sendcmd 10.53.0.8 ret=0 dig_with_opts +timeout=1 +nofail +tcp @10.53.0.8 a.example > dig.out.test$n 2>&1 || ret=1 grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 @@ -1016,19 +1020,54 @@ if [ -x "$DIG" ] ; then status=$((status+ret)) n=$((n+1)) - echo_i "check that dig tries the next server after a connection error ($n)" + echo_i "check that dig tries the next server after a UDP socket read error ($n)" ret=0 - dig_with_opts -d @10.53.0.99 @10.53.0.3 a.example > dig.out.test$n 2>&1 || ret=1 + dig_with_opts @10.53.0.99 @10.53.0.3 a.example > dig.out.test$n 2>&1 || ret=1 grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 if [ $ret -ne 0 ]; then echo_i "failed"; fi status=$((status+ret)) n=$((n+1)) - echo_i "check that dig tries the next server after timeouts ($n)" - # Ask ans4 to not respond to queries - echo "//" | sendcmd 10.53.0.4 + echo_i "check that dig tries the next server after a TCP socket read error ($n)" + # Ask ans8 to be in "close" mode, which closes the connection after accepting it + echo "close" | sendcmd 10.53.0.8 ret=0 - dig_with_opts -d @10.53.0.4 @10.53.0.3 a.example > dig.out.test$n 2>&1 || ret=1 + dig_with_opts +tcp @10.53.0.8 @10.53.0.3 a.example > dig.out.test$n 2>&1 || ret=1 + grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 + if [ $ret -ne 0 ]; then echo_i "failed"; fi + status=$((status+ret)) + + # Note that we combine TCP socket "connection error" and "timeout" cases in + # one, because it is not trivial to simulate the timeout case in a system test + # in Linux without a firewall, but the code which handles error cases during + # the connection establishment time does not differentiate between timeout and + # other types of errors (unlike during reading), so this one check should be + # sufficient for both cases. + n=$((n+1)) + echo_i "check that dig tries the next server after a TCP socket connection error/timeout ($n)" + ret=0 + dig_with_opts +tcp @10.53.0.99 @10.53.0.3 a.example > dig.out.test$n 2>&1 || ret=1 + test $(grep "connection refused\|timed out" dig.out.test$n | wc -l) -eq 3 || ret=1 + grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 + if [ $ret -ne 0 ]; then echo_i "failed"; fi + status=$((status+ret)) + + n=$((n+1)) + echo_i "check that dig tries the next server after UDP socket read timeouts ($n)" + # Ask ans8 to be in "silent" mode + echo "silent" | sendcmd 10.53.0.8 + ret=0 + dig_with_opts +timeout=1 @10.53.0.8 @10.53.0.3 a.example > dig.out.test$n 2>&1 || ret=1 + grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 + if [ $ret -ne 0 ]; then echo_i "failed"; fi + status=$((status+ret)) + + n=$((n+1)) + echo_i "check that dig tries the next server after TCP socket read timeouts ($n)" + # Ask ans8 to be in "silent" mode + echo "silent" | sendcmd 10.53.0.8 + ret=0 + dig_with_opts +timeout=1 +tcp @10.53.0.8 @10.53.0.3 a.example > dig.out.test$n 2>&1 || ret=1 grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 if [ $ret -ne 0 ]; then echo_i "failed"; fi status=$((status+ret)) From ced79790b3c6a64202083633741f93e5673071d7 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Sun, 13 Mar 2022 13:53:50 +0000 Subject: [PATCH 3/3] Add CHANGES note for [GL #3205] --- CHANGES | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES b/CHANGES index 061cedd6eb..0e652f6e35 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,8 @@ +5833. [bug] When encountering socket error while trying to initiate + a TCP connection to a server, dig could hang + indefinitely, when there were more servers to try. + [GL #3205] + 5832. [bug] When timing-out or having other types of socket errors during a query, dig wasn't trying to perform the lookup using other servers, in case they exist. [GL #3128]