diff --git a/configure.ac b/configure.ac index 580095ef75..ce8117b15d 100644 --- a/configure.ac +++ b/configure.ac @@ -551,6 +551,9 @@ AC_MSG_CHECKING([for libuv]) PKG_CHECK_MODULES([LIBUV], [libuv >= 1.0.0], [], [AC_MSG_ERROR([libuv not found])]) +# libuv recvmmsg support +AC_CHECK_DECLS([UV_UDP_RECVMMSG, UV_UDP_MMSG_FREE, UV_UDP_MMSG_CHUNK], [], [], [[#include ]]) + # [pairwise: --enable-doh --with-libnghttp2=auto, --enable-doh --with-libnghttp2=yes, --disable-doh] AC_ARG_ENABLE([doh], [AS_HELP_STRING([--disable-doh], [enable DNS over HTTPS, requires libnghttp2 (default=yes)])], diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index d368dcafac..0b93c22903 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -46,19 +46,51 @@ /* Must be different from ISC_NETMGR_TID_UNKNOWN */ #define ISC_NETMGR_NON_INTERLOCKED -2 -#define ISC_NETMGR_TLSBUF_SIZE 65536 +/* + * Receive buffers + */ +#if HAVE_DECL_UV_UDP_MMSG_CHUNK +/* + * The value 20 here is UV__MMSG_MAXWIDTH taken from the current libuv source, + * libuv will not receive more that 20 datagrams in a single recvmmsg call. + */ +#define ISC_NETMGR_UDP_RECVBUF_SIZE (20 * UINT16_MAX) +#else +/* + * A single DNS message size + */ +#define ISC_NETMGR_UDP_RECVBUF_SIZE UINT16_MAX +#endif /* - * New versions of libuv support recvmmsg on unices. - * Since recvbuf is only allocated per worker allocating a bigger one is not - * that wasteful. - * 20 here is UV__MMSG_MAXWIDTH taken from the current libuv source, nothing - * will break if the original value changes. + * The TCP receive buffer can fit one maximum sized DNS message plus its size, + * the receive buffer here affects TCP, DoT and DoH. */ -#define ISC_NETMGR_RECVBUF_SIZE (20 * 65536) +#define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX) +/* Pick the larger buffer */ +#define ISC_NETMGR_RECVBUF_SIZE \ + (ISC_NETMGR_UDP_RECVBUF_SIZE >= ISC_NETMGR_TCP_RECVBUF_SIZE \ + ? ISC_NETMGR_UDP_RECVBUF_SIZE \ + : ISC_NETMGR_TCP_RECVBUF_SIZE) + +/* + * Send buffer + */ #define ISC_NETMGR_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX) +/* + * Make sure our RECVBUF size is large enough + */ + +STATIC_ASSERT(ISC_NETMGR_UDP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE, + "UDP receive buffer size must be smaller or equal than worker " + "receive buffer size"); + +STATIC_ASSERT(ISC_NETMGR_TCP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE, + "TCP receive buffer size must be smaller or equal than worker " + "receive buffer size"); + /*% * Regular TCP buffer size. */ @@ -70,7 +102,7 @@ * most in TCPDNS or TLSDNS connections, so there's no risk of overrun * when using a buffer this size. */ -#define NM_BIG_BUF (65535 + 2) * 2 +#define NM_BIG_BUF ISC_NETMGR_TCP_RECVBUF_SIZE * 2 #if defined(SO_REUSEPORT_LB) || (defined(SO_REUSEPORT) && defined(__linux__)) #define HAVE_SO_REUSEPORT_LB 1 diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index e68ee9f368..0af8b1044d 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -1599,20 +1599,10 @@ isc__nm_free_uvbuf(isc_nmsocket_t *sock, const uv_buf_t *buf) { isc__networker_t *worker = NULL; REQUIRE(VALID_NMSOCK(sock)); - if (buf->base == NULL) { - /* Empty buffer: might happen in case of error. */ - return; - } - worker = &sock->mgr->workers[sock->tid]; - REQUIRE(worker->recvbuf_inuse); - if (sock->type == isc_nm_udpsocket && buf->base > worker->recvbuf && - buf->base <= worker->recvbuf + ISC_NETMGR_RECVBUF_SIZE) - { - /* Can happen in case of out-of-order recvmmsg in libuv1.36 */ - return; - } + worker = &sock->mgr->workers[sock->tid]; REQUIRE(buf->base == worker->recvbuf); + worker->recvbuf_inuse = false; } @@ -2187,7 +2177,7 @@ isc__nm_get_read_req(isc_nmsocket_t *sock, isc_sockaddr_t *sockaddr) { } /*%< - * Allocator for read operations. Limited to size 2^16. + * Allocator callback for read operations. * * Note this doesn't actually allocate anything, it just assigns the * worker's receive buffer to a socket, and marks it as "in use". @@ -2199,35 +2189,34 @@ isc__nm_alloc_cb(uv_handle_t *handle, size_t size, uv_buf_t *buf) { REQUIRE(VALID_NMSOCK(sock)); REQUIRE(isc__nm_in_netthread()); + /* + * The size provided by libuv is only suggested size, and it always + * defaults to 64 * 1024 in the current versions of libuv (see + * src/unix/udp.c and src/unix/stream.c). + */ + UNUSED(size); + + worker = &sock->mgr->workers[sock->tid]; + INSIST(!worker->recvbuf_inuse); + INSIST(worker->recvbuf != NULL); switch (sock->type) { case isc_nm_udpsocket: - REQUIRE(size <= ISC_NETMGR_RECVBUF_SIZE); - size = ISC_NETMGR_RECVBUF_SIZE; + buf->len = ISC_NETMGR_UDP_RECVBUF_SIZE; break; case isc_nm_tcpsocket: case isc_nm_tcpdnssocket: - break; case isc_nm_tlsdnssocket: - /* - * We need to limit the individual chunks to be read, so the - * BIO_write() will always succeed and the consumed before the - * next readcb is called. - */ - if (size >= ISC_NETMGR_TLSBUF_SIZE) { - size = ISC_NETMGR_TLSBUF_SIZE; - } + buf->len = ISC_NETMGR_TCP_RECVBUF_SIZE; break; default: INSIST(0); ISC_UNREACHABLE(); } - worker = &sock->mgr->workers[sock->tid]; - INSIST(!worker->recvbuf_inuse || sock->type == isc_nm_udpsocket); - + REQUIRE(buf->len <= ISC_NETMGR_RECVBUF_SIZE); buf->base = worker->recvbuf; - buf->len = size; + worker->recvbuf_inuse = true; } diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index 012a9643fd..186bb83cf4 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -903,6 +903,15 @@ isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { } free: + if (nread < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + isc__nm_free_uvbuf(sock, buf); } diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c index c600982e52..6c840634f4 100644 --- a/lib/isc/netmgr/tcpdns.c +++ b/lib/isc/netmgr/tcpdns.c @@ -879,6 +879,15 @@ isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread, isc__nm_process_sock_buffer(sock); free: + if (nread < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + isc__nm_free_uvbuf(sock, buf); } diff --git a/lib/isc/netmgr/tlsdns.c b/lib/isc/netmgr/tlsdns.c index 830e464189..385a18aace 100644 --- a/lib/isc/netmgr/tlsdns.c +++ b/lib/isc/netmgr/tlsdns.c @@ -264,12 +264,12 @@ tlsdns_connect_cb(uv_connect_t *uvreq, int status) { /* * */ - r = BIO_new_bio_pair(&sock->tls.ssl_wbio, ISC_NETMGR_TLSBUF_SIZE, - &sock->tls.app_rbio, ISC_NETMGR_TLSBUF_SIZE); + r = BIO_new_bio_pair(&sock->tls.ssl_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &sock->tls.app_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE); RUNTIME_CHECK(r == 1); - r = BIO_new_bio_pair(&sock->tls.ssl_rbio, ISC_NETMGR_TLSBUF_SIZE, - &sock->tls.app_wbio, ISC_NETMGR_TLSBUF_SIZE); + r = BIO_new_bio_pair(&sock->tls.ssl_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &sock->tls.app_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE); RUNTIME_CHECK(r == 1); #if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO @@ -1003,8 +1003,8 @@ tls_cycle_input(isc_nmsocket_t *sock) { (void)SSL_peek(sock->tls.tls, &(char){ '\0' }, 0); int pending = SSL_pending(sock->tls.tls); - if (pending > ISC_NETMGR_TLSBUF_SIZE) { - pending = ISC_NETMGR_TLSBUF_SIZE; + if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) { + pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE; } if ((sock->buf_len + pending) > sock->buf_size) { @@ -1194,8 +1194,8 @@ tls_cycle_output(isc_nmsocket_t *sock) { break; } - if (pending > ISC_NETMGR_TLSBUF_SIZE) { - pending = ISC_NETMGR_TLSBUF_SIZE; + if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) { + pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE; } sock->tls.senddata.base = isc_mem_get(sock->mgr->mctx, pending); @@ -1381,6 +1381,16 @@ isc__nm_tlsdns_read_cb(uv_stream_t *stream, ssize_t nread, } free: async_tlsdns_cycle(sock); + + if (nread < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + isc__nm_free_uvbuf(sock, buf); } @@ -1516,12 +1526,12 @@ accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota) { csock->tls.tls = isc_tls_create(ssock->tls.ctx); RUNTIME_CHECK(csock->tls.tls != NULL); - r = BIO_new_bio_pair(&csock->tls.ssl_wbio, ISC_NETMGR_TLSBUF_SIZE, - &csock->tls.app_rbio, ISC_NETMGR_TLSBUF_SIZE); + r = BIO_new_bio_pair(&csock->tls.ssl_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &csock->tls.app_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE); RUNTIME_CHECK(r == 1); - r = BIO_new_bio_pair(&csock->tls.ssl_rbio, ISC_NETMGR_TLSBUF_SIZE, - &csock->tls.app_wbio, ISC_NETMGR_TLSBUF_SIZE); + r = BIO_new_bio_pair(&csock->tls.ssl_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &csock->tls.app_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE); RUNTIME_CHECK(r == 1); #if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO diff --git a/lib/isc/netmgr/udp.c b/lib/isc/netmgr/udp.c index 8d0f190040..4cb3fdc429 100644 --- a/lib/isc/netmgr/udp.c +++ b/lib/isc/netmgr/udp.c @@ -431,7 +431,7 @@ isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0) { REQUIRE(sock->parent != NULL); REQUIRE(sock->tid == isc_nm_tid()); -#ifdef UV_UDP_RECVMMSG +#if HAVE_DECL_UV_UDP_RECVMMSG uv_init_flags |= UV_UDP_RECVMMSG; #endif r = uv_udp_init_ex(&worker->loop, &sock->uv_handle.udp, uv_init_flags); @@ -556,7 +556,6 @@ udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf, isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)handle); isc__nm_uvreq_t *req = NULL; uint32_t maxudp; - bool free_buf; isc_result_t result; isc_sockaddr_t sockaddr, *sa = NULL; @@ -564,19 +563,22 @@ udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf, REQUIRE(sock->tid == isc_nm_tid()); REQUIRE(atomic_load(&sock->reading)); -#ifdef UV_UDP_MMSG_FREE - free_buf = ((flags & UV_UDP_MMSG_FREE) == UV_UDP_MMSG_FREE); -#elif UV_UDP_MMSG_CHUNK - free_buf = ((flags & UV_UDP_MMSG_CHUNK) == 0); + /* + * When using recvmmsg(2), if no errors occur, there will be a final + * callback with nrecv set to 0, addr set to NULL and the buffer + * pointing at the initially allocated data with the UV_UDP_MMSG_CHUNK + * flag cleared and the UV_UDP_MMSG_FREE flag set. + */ +#if HAVE_DECL_UV_UDP_MMSG_FREE + if ((flags & UV_UDP_MMSG_FREE) == UV_UDP_MMSG_FREE) { + INSIST(nrecv == 0); + INSIST(addr == NULL); + goto free; + } #else - free_buf = true; UNUSED(flags); #endif - /* - * Four possible reasons to return now without processing: - */ - /* * - If we're simulating a firewall blocking UDP packets * bigger than 'maxudp' bytes for testing purposes. @@ -640,9 +642,31 @@ udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf, sock->processing = false; free: - if (free_buf) { - isc__nm_free_uvbuf(sock, buf); +#if HAVE_DECL_UV_UDP_MMSG_CHUNK + /* + * When using recvmmsg(2), chunks will have the UV_UDP_MMSG_CHUNK flag + * set, those must not be freed. + */ + if ((flags & UV_UDP_MMSG_CHUNK) == UV_UDP_MMSG_CHUNK) { + return; } +#endif + + /* + * When using recvmmsg(2), if a UDP socket error occurs, nrecv will be < + * 0. In either scenario, the callee can now safely free the provided + * buffer. + */ + if (nrecv < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + + isc__nm_free_uvbuf(sock, buf); } /*