From 5d34a14f22c34267dfa4b83ea78af2b3ab4dcffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Tue, 5 Oct 2021 22:30:55 +0200 Subject: [PATCH 1/3] Set minimum MTU (1280) on IPv6 sockets The IPV6_USE_MIN_MTU socket option directs the IP layer to limit the IPv6 packet size to the minimum required supported MTU from the base IPv6 specification, i.e. 1280 bytes. Many implementations of TCP running over IPv6 neglect to check the IPV6_USE_MIN_MTU value when performing MSS negotiation and when constructing a TCP segment despite MSS being defined to be the MTU less the IP and TCP header sizes (60 bytes for IPv6). This leads to oversized IPv6 packets being sent resulting in unintended Path Maximum Transmission Unit Discovery (PMTUD) being performed and to fragmented IPv6 packets being sent. Add and use a function to set socket option to limit the MTU on IPv6 sockets to the minimum MTU (1280) both for UDP and TCP. --- lib/isc/netmgr/netmgr-int.h | 6 ++++++ lib/isc/netmgr/netmgr.c | 21 +++++++++++++++++++++ lib/isc/netmgr/tcp.c | 4 +++- lib/isc/netmgr/tcpdns.c | 4 +++- lib/isc/netmgr/tlsdns.c | 4 +++- lib/isc/netmgr/udp.c | 4 ++++ 6 files changed, 40 insertions(+), 3 deletions(-) diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index a6c8b0a8a6..03897e77bf 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -1863,6 +1863,12 @@ isc__nm_socket_tcp_nodelay(uv_os_sock_t fd); * Disables Nagle's algorithm on a TCP socket (sets TCP_NODELAY). 
*/ +isc_result_t +isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family); +/*%< + * Use minimum MTU on IPv6 sockets + */ + void isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle); /*%> diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index eafce4ec1c..8f5589cd6e 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -3263,6 +3263,27 @@ isc__nm_socket_tcp_nodelay(uv_os_sock_t fd) { #endif } +isc_result_t +isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) { + if (sa_family != AF_INET6) { + return (ISC_R_SUCCESS); + } +#ifdef IPV6_USE_MIN_MTU + if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU) == -1) { + return (ISC_R_FAILURE); + } +#elif defined(IPV6_MTU) + if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU, &(int){ 1280 }, + sizeof(int)) == -1) { + return (ISC_R_FAILURE); + } +#else + UNUSED(fd); +#endif + + return (ISC_R_SUCCESS); +} + void isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle) { int32_t recv_buffer_size = 0; diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index b033e71cc8..4e187d31e2 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -342,6 +342,8 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, return; } + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + ievent = isc__nm_get_netievent_tcpconnect(mgr, sock, req); if (isc__nm_in_netthread()) { @@ -525,7 +527,7 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->parent != NULL); REQUIRE(sock->tid == isc_nm_tid()); - /* TODO: set min mss */ + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); UV_RUNTIME_CHECK(uv_tcp_init, r); diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c index fcce42df97..1b84b50da7 100644 --- a/lib/isc/netmgr/tcpdns.c +++ b/lib/isc/netmgr/tcpdns.c @@ -297,6 +297,8 @@ isc_nm_tcpdnsconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, 
return; } + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + /* 2 minute timeout */ result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); RUNTIME_CHECK(result == ISC_R_SUCCESS); @@ -489,7 +491,7 @@ isc__nm_async_tcpdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->parent != NULL); REQUIRE(sock->tid == isc_nm_tid()); - /* TODO: set min mss */ + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); UV_RUNTIME_CHECK(uv_tcp_init, r); diff --git a/lib/isc/netmgr/tlsdns.c b/lib/isc/netmgr/tlsdns.c index af5362f7e6..369eac1497 100644 --- a/lib/isc/netmgr/tlsdns.c +++ b/lib/isc/netmgr/tlsdns.c @@ -351,6 +351,8 @@ isc_nm_tlsdnsconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, goto failure; } + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + /* 2 minute timeout */ result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); RUNTIME_CHECK(result == ISC_R_SUCCESS); @@ -560,7 +562,7 @@ isc__nm_async_tlsdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->parent != NULL); REQUIRE(sock->tid == isc_nm_tid()); - /* TODO: set min mss */ + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); UV_RUNTIME_CHECK(uv_tcp_init, r); diff --git a/lib/isc/netmgr/udp.c b/lib/isc/netmgr/udp.c index e8ef4100f1..624932bc98 100644 --- a/lib/isc/netmgr/udp.c +++ b/lib/isc/netmgr/udp.c @@ -439,6 +439,8 @@ isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->parent != NULL); REQUIRE(sock->tid == isc_nm_tid()); + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + #if HAVE_DECL_UV_UDP_RECVMMSG uv_init_flags |= UV_UDP_RECVMMSG; #endif @@ -1031,6 +1033,8 @@ isc_nm_udpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, (void)isc__nm_socket_disable_pmtud(sock->fd, sa_family); + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + event = 
isc__nm_get_netievent_udpconnect(mgr, sock, req); if (isc__nm_in_netthread()) { From 8098a585818f2a2cf095994bed2bec51822fa0b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Tue, 5 Oct 2021 22:30:55 +0200 Subject: [PATCH 2/3] Set TCP maximum segment size to minimum size of 1220 Previously the socket code would set the TCPv6 maximum segment size to minimum value to prevent IP fragmentation for TCP. This was not yet implemented for the network manager. Implement network manager functions to set and use minimum MTU socket option and set the TCP_MAXSEG socket option for both IPv4 and IPv6 and use those to clamp the TCP maximum segment size for TCP, TCPDNS and TLSDNS layers in the network manager to 1220 bytes, that is 1280 (IPv6 minimum link MTU) minus 40 (IPv6 fixed header) minus 20 (TCP fixed header). We already rely on a similar value for UDP to prevent IP fragmentation and it makes sense to use the same value for IPv4 and IPv6 because the modern networks are required to support IPv6 packet sizes. If there's a need for small TCP segment values, the MTU on the interfaces needs to be properly configured. --- lib/isc/netmgr/netmgr-int.h | 14 ++++++++++++++ lib/isc/netmgr/netmgr.c | 16 ++++++++++++++++ lib/isc/netmgr/tcp.c | 2 ++ lib/isc/netmgr/tcpdns.c | 2 ++ lib/isc/netmgr/tlsdns.c | 2 ++ 5 files changed, 36 insertions(+) diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index 03897e77bf..596423458b 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -103,6 +103,14 @@ STATIC_ASSERT(ISC_NETMGR_TCP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE, */ #define NM_BIG_BUF ISC_NETMGR_TCP_RECVBUF_SIZE * 2 +/*% + * Maximum segment size (MSS) of TCP socket on which the server responds to + * queries. Value lower than common MSS on Ethernet (1220, that is 1280 (IPv6 + * minimum link MTU) - 40 (IPv6 fixed header) - 20 (TCP fixed header)) will + * address path MTU problem. 
+ */ +#define NM_MAXSEG (1280 - 20 - 40) + #if defined(SO_REUSEPORT_LB) || (defined(SO_REUSEPORT) && defined(__linux__)) #define HAVE_SO_REUSEPORT_LB 1 #endif @@ -1863,6 +1871,12 @@ isc__nm_socket_tcp_nodelay(uv_os_sock_t fd); * Disables Nagle's algorithm on a TCP socket (sets TCP_NODELAY). */ +isc_result_t +isc__nm_socket_tcp_maxseg(uv_os_sock_t fd, int size); +/*%< + * Set the TCP maximum segment size + */ + isc_result_t isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family); /*%< diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index 8f5589cd6e..f23be6bca9 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -3263,6 +3263,22 @@ isc__nm_socket_tcp_nodelay(uv_os_sock_t fd) { #endif } +isc_result_t +isc__nm_socket_tcp_maxseg(uv_os_sock_t fd, int size) { +#ifdef TCP_MAXSEG + if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *)&size, + sizeof(size))) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); + UNUSED(size); + return (ISC_R_SUCCESS); +#endif +} + isc_result_t isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) { if (sa_family != AF_INET6) { diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index 4e187d31e2..c924c25e6c 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -343,6 +343,7 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, } (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); ievent = isc__nm_get_netievent_tcpconnect(mgr, sock, req); @@ -528,6 +529,7 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->tid == isc_nm_tid()); (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); UV_RUNTIME_CHECK(uv_tcp_init, r); diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c index 1b84b50da7..d57e2a149d 100644 --- 
a/lib/isc/netmgr/tcpdns.c +++ b/lib/isc/netmgr/tcpdns.c @@ -298,6 +298,7 @@ isc_nm_tcpdnsconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, } (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); /* 2 minute timeout */ result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); @@ -492,6 +493,7 @@ isc__nm_async_tcpdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->tid == isc_nm_tid()); (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); UV_RUNTIME_CHECK(uv_tcp_init, r); diff --git a/lib/isc/netmgr/tlsdns.c b/lib/isc/netmgr/tlsdns.c index 369eac1497..1e93a92c5e 100644 --- a/lib/isc/netmgr/tlsdns.c +++ b/lib/isc/netmgr/tlsdns.c @@ -352,6 +352,7 @@ isc_nm_tlsdnsconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, } (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); /* 2 minute timeout */ result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); @@ -563,6 +564,7 @@ isc__nm_async_tlsdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->tid == isc_nm_tid()); (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); UV_RUNTIME_CHECK(uv_tcp_init, r); From 67dbe0ae4d8f6a936da07de2f64319c0aab974df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Fri, 4 Mar 2022 14:28:08 +0100 Subject: [PATCH 3/3] Add CHANGES note for [GL #2201] --- CHANGES | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES b/CHANGES index 542742d26c..7839b51d26 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +5825. [funcf] Set the minimum MTU on UDPv6 and TCPv6 sockets and + limit TCP maximum segment size (TCP_MAXSEG) to (1220) + for both TCPv4 and TCPv6 sockets. 
[GL #2201] + 5824. [bug] Invalid dnssec-policy definitions were being accepted where the defined keys did not cover both KSK and ZSK roles for a given algorithm. This is now checked for