2
0
mirror of https://gitlab.isc.org/isc-projects/bind9 synced 2025-09-01 23:25:38 +00:00

Merge branch '3095-invalid-recvmmsg-detection' into 'main'

Fix the UDP recvmmsg support

Closes #3095

See merge request isc-projects/bind9!5713
This commit is contained in:
Ondřej Surý
2022-01-13 18:43:00 +00:00
9 changed files with 148 additions and 74 deletions

View File

@@ -1,3 +1,6 @@
5793. [bug] Correctly detect and enable UDP recvmmsg support
in all versions of libuv that support it. [GL #3095]
5792. [bug] Don't schedule zone events on ISC_R_SHUTTINGDOWN 5792. [bug] Don't schedule zone events on ISC_R_SHUTTINGDOWN
event failures. [GL #3084] event failures. [GL #3084]

View File

@@ -551,6 +551,11 @@ AC_MSG_CHECKING([for libuv])
PKG_CHECK_MODULES([LIBUV], [libuv >= 1.0.0], [], PKG_CHECK_MODULES([LIBUV], [libuv >= 1.0.0], [],
[AC_MSG_ERROR([libuv not found])]) [AC_MSG_ERROR([libuv not found])])
# libuv recvmmsg support
AC_CHECK_DECLS([UV_UDP_RECVMMSG, UV_UDP_MMSG_FREE, UV_UDP_MMSG_CHUNK], [], [], [[#include <uv.h>]])
AS_CASE([$host],
[*-musl],[AC_DEFINE([HAVE_DECL_UV_UDP_RECVMMSG], [0], [Disable recvmmsg support on systems with musl libc])])
# [pairwise: --enable-doh --with-libnghttp2=auto, --enable-doh --with-libnghttp2=yes, --disable-doh] # [pairwise: --enable-doh --with-libnghttp2=auto, --enable-doh --with-libnghttp2=yes, --disable-doh]
AC_ARG_ENABLE([doh], AC_ARG_ENABLE([doh],
[AS_HELP_STRING([--disable-doh], [enable DNS over HTTPS, requires libnghttp2 (default=yes)])], [AS_HELP_STRING([--disable-doh], [enable DNS over HTTPS, requires libnghttp2 (default=yes)])],

View File

@@ -57,3 +57,7 @@ Bug Fixes
- Using ``rndc`` on a busy recursive server could cause the ``named`` to abort - Using ``rndc`` on a busy recursive server could cause the ``named`` to abort
with assertion failure. This has been fixed. :gl:`#3079` with assertion failure. This has been fixed. :gl:`#3079`
- With libuv >= 1.37.0, the recvmmsg support would not be enabled in ``named``,
reducing the maximum query-response performance. The recvmmsg support would
be used only in libuv 1.35.0 and 1.36.0. This has been fixed. :gl:`#3095`

View File

@@ -46,19 +46,51 @@
/* Must be different from ISC_NETMGR_TID_UNKNOWN */ /* Must be different from ISC_NETMGR_TID_UNKNOWN */
#define ISC_NETMGR_NON_INTERLOCKED -2 #define ISC_NETMGR_NON_INTERLOCKED -2
#define ISC_NETMGR_TLSBUF_SIZE 65536 /*
* Receive buffers
*/
#if HAVE_DECL_UV_UDP_MMSG_CHUNK
/*
* The value 20 here is UV__MMSG_MAXWIDTH taken from the current libuv source,
 * libuv will not receive more than 20 datagrams in a single recvmmsg call.
*/
#define ISC_NETMGR_UDP_RECVBUF_SIZE (20 * UINT16_MAX)
#else
/*
* A single DNS message size
*/
#define ISC_NETMGR_UDP_RECVBUF_SIZE UINT16_MAX
#endif
/* /*
* New versions of libuv support recvmmsg on unices. * The TCP receive buffer can fit one maximum sized DNS message plus its size,
* Since recvbuf is only allocated per worker allocating a bigger one is not * the receive buffer here affects TCP, DoT and DoH.
* that wasteful.
* 20 here is UV__MMSG_MAXWIDTH taken from the current libuv source, nothing
* will break if the original value changes.
*/ */
#define ISC_NETMGR_RECVBUF_SIZE (20 * 65536) #define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
/* Pick the larger buffer */
#define ISC_NETMGR_RECVBUF_SIZE \
(ISC_NETMGR_UDP_RECVBUF_SIZE >= ISC_NETMGR_TCP_RECVBUF_SIZE \
? ISC_NETMGR_UDP_RECVBUF_SIZE \
: ISC_NETMGR_TCP_RECVBUF_SIZE)
/*
* Send buffer
*/
#define ISC_NETMGR_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX) #define ISC_NETMGR_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
/*
* Make sure our RECVBUF size is large enough
*/
STATIC_ASSERT(ISC_NETMGR_UDP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE,
"UDP receive buffer size must be smaller or equal than worker "
"receive buffer size");
STATIC_ASSERT(ISC_NETMGR_TCP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE,
"TCP receive buffer size must be smaller or equal than worker "
"receive buffer size");
/*% /*%
* Regular TCP buffer size. * Regular TCP buffer size.
*/ */
@@ -70,7 +102,7 @@
* most in TCPDNS or TLSDNS connections, so there's no risk of overrun * most in TCPDNS or TLSDNS connections, so there's no risk of overrun
* when using a buffer this size. * when using a buffer this size.
*/ */
#define NM_BIG_BUF (65535 + 2) * 2 #define NM_BIG_BUF ISC_NETMGR_TCP_RECVBUF_SIZE * 2
#if defined(SO_REUSEPORT_LB) || (defined(SO_REUSEPORT) && defined(__linux__)) #if defined(SO_REUSEPORT_LB) || (defined(SO_REUSEPORT) && defined(__linux__))
#define HAVE_SO_REUSEPORT_LB 1 #define HAVE_SO_REUSEPORT_LB 1

View File

@@ -1599,20 +1599,10 @@ isc__nm_free_uvbuf(isc_nmsocket_t *sock, const uv_buf_t *buf) {
isc__networker_t *worker = NULL; isc__networker_t *worker = NULL;
REQUIRE(VALID_NMSOCK(sock)); REQUIRE(VALID_NMSOCK(sock));
if (buf->base == NULL) {
/* Empty buffer: might happen in case of error. */
return;
}
worker = &sock->mgr->workers[sock->tid];
REQUIRE(worker->recvbuf_inuse); worker = &sock->mgr->workers[sock->tid];
if (sock->type == isc_nm_udpsocket && buf->base > worker->recvbuf &&
buf->base <= worker->recvbuf + ISC_NETMGR_RECVBUF_SIZE)
{
/* Can happen in case of out-of-order recvmmsg in libuv1.36 */
return;
}
REQUIRE(buf->base == worker->recvbuf); REQUIRE(buf->base == worker->recvbuf);
worker->recvbuf_inuse = false; worker->recvbuf_inuse = false;
} }
@@ -2187,7 +2177,7 @@ isc__nm_get_read_req(isc_nmsocket_t *sock, isc_sockaddr_t *sockaddr) {
} }
/*%< /*%<
* Allocator for read operations. Limited to size 2^16. * Allocator callback for read operations.
* *
* Note this doesn't actually allocate anything, it just assigns the * Note this doesn't actually allocate anything, it just assigns the
* worker's receive buffer to a socket, and marks it as "in use". * worker's receive buffer to a socket, and marks it as "in use".
@@ -2199,35 +2189,34 @@ isc__nm_alloc_cb(uv_handle_t *handle, size_t size, uv_buf_t *buf) {
REQUIRE(VALID_NMSOCK(sock)); REQUIRE(VALID_NMSOCK(sock));
REQUIRE(isc__nm_in_netthread()); REQUIRE(isc__nm_in_netthread());
/*
* The size provided by libuv is only suggested size, and it always
* defaults to 64 * 1024 in the current versions of libuv (see
* src/unix/udp.c and src/unix/stream.c).
*/
UNUSED(size);
worker = &sock->mgr->workers[sock->tid];
INSIST(!worker->recvbuf_inuse);
INSIST(worker->recvbuf != NULL);
switch (sock->type) { switch (sock->type) {
case isc_nm_udpsocket: case isc_nm_udpsocket:
REQUIRE(size <= ISC_NETMGR_RECVBUF_SIZE); buf->len = ISC_NETMGR_UDP_RECVBUF_SIZE;
size = ISC_NETMGR_RECVBUF_SIZE;
break; break;
case isc_nm_tcpsocket: case isc_nm_tcpsocket:
case isc_nm_tcpdnssocket: case isc_nm_tcpdnssocket:
break;
case isc_nm_tlsdnssocket: case isc_nm_tlsdnssocket:
/* buf->len = ISC_NETMGR_TCP_RECVBUF_SIZE;
* We need to limit the individual chunks to be read, so the
* BIO_write() will always succeed and the consumed before the
* next readcb is called.
*/
if (size >= ISC_NETMGR_TLSBUF_SIZE) {
size = ISC_NETMGR_TLSBUF_SIZE;
}
break; break;
default: default:
INSIST(0); INSIST(0);
ISC_UNREACHABLE(); ISC_UNREACHABLE();
} }
worker = &sock->mgr->workers[sock->tid]; REQUIRE(buf->len <= ISC_NETMGR_RECVBUF_SIZE);
INSIST(!worker->recvbuf_inuse || sock->type == isc_nm_udpsocket);
buf->base = worker->recvbuf; buf->base = worker->recvbuf;
buf->len = size;
worker->recvbuf_inuse = true; worker->recvbuf_inuse = true;
} }

View File

@@ -816,7 +816,6 @@ isc__nm_tcp_resumeread(isc_nmhandle_t *handle) {
isc__netievent_tcpstartread_t *ievent = NULL; isc__netievent_tcpstartread_t *ievent = NULL;
isc_nmsocket_t *sock = handle->sock; isc_nmsocket_t *sock = handle->sock;
isc__networker_t *worker = &sock->mgr->workers[sock->tid];
REQUIRE(sock->tid == isc_nm_tid()); REQUIRE(sock->tid == isc_nm_tid());
@@ -838,19 +837,9 @@ isc__nm_tcp_resumeread(isc_nmhandle_t *handle) {
ievent = isc__nm_get_netievent_tcpstartread(sock->mgr, sock); ievent = isc__nm_get_netievent_tcpstartread(sock->mgr, sock);
if (worker->recvbuf_inuse) { isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid],
/*
* If we happen to call the resumeread from inside the receive
* callback, the worker->recvbuf might still be in use, so we
* need to force enqueue the next read event.
*/
isc__nm_enqueue_ievent(worker, (isc__netievent_t *)ievent);
} else {
isc__nm_maybe_enqueue_ievent(worker,
(isc__netievent_t *)ievent); (isc__netievent_t *)ievent);
} }
}
void void
isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) {
@@ -903,6 +892,15 @@ isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) {
} }
free: free:
if (nread < 0) {
/*
* The buffer may be a null buffer on error.
*/
if (buf->base == NULL && buf->len == 0) {
return;
}
}
isc__nm_free_uvbuf(sock, buf); isc__nm_free_uvbuf(sock, buf);
} }

View File

@@ -879,6 +879,15 @@ isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread,
isc__nm_process_sock_buffer(sock); isc__nm_process_sock_buffer(sock);
free: free:
if (nread < 0) {
/*
* The buffer may be a null buffer on error.
*/
if (buf->base == NULL && buf->len == 0) {
return;
}
}
isc__nm_free_uvbuf(sock, buf); isc__nm_free_uvbuf(sock, buf);
} }

View File

@@ -264,12 +264,12 @@ tlsdns_connect_cb(uv_connect_t *uvreq, int status) {
/* /*
* *
*/ */
r = BIO_new_bio_pair(&sock->tls.ssl_wbio, ISC_NETMGR_TLSBUF_SIZE, r = BIO_new_bio_pair(&sock->tls.ssl_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE,
&sock->tls.app_rbio, ISC_NETMGR_TLSBUF_SIZE); &sock->tls.app_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE);
RUNTIME_CHECK(r == 1); RUNTIME_CHECK(r == 1);
r = BIO_new_bio_pair(&sock->tls.ssl_rbio, ISC_NETMGR_TLSBUF_SIZE, r = BIO_new_bio_pair(&sock->tls.ssl_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE,
&sock->tls.app_wbio, ISC_NETMGR_TLSBUF_SIZE); &sock->tls.app_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE);
RUNTIME_CHECK(r == 1); RUNTIME_CHECK(r == 1);
#if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO #if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO
@@ -1003,8 +1003,8 @@ tls_cycle_input(isc_nmsocket_t *sock) {
(void)SSL_peek(sock->tls.tls, &(char){ '\0' }, 0); (void)SSL_peek(sock->tls.tls, &(char){ '\0' }, 0);
int pending = SSL_pending(sock->tls.tls); int pending = SSL_pending(sock->tls.tls);
if (pending > ISC_NETMGR_TLSBUF_SIZE) { if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) {
pending = ISC_NETMGR_TLSBUF_SIZE; pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE;
} }
if ((sock->buf_len + pending) > sock->buf_size) { if ((sock->buf_len + pending) > sock->buf_size) {
@@ -1194,8 +1194,8 @@ tls_cycle_output(isc_nmsocket_t *sock) {
break; break;
} }
if (pending > ISC_NETMGR_TLSBUF_SIZE) { if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) {
pending = ISC_NETMGR_TLSBUF_SIZE; pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE;
} }
sock->tls.senddata.base = isc_mem_get(sock->mgr->mctx, pending); sock->tls.senddata.base = isc_mem_get(sock->mgr->mctx, pending);
@@ -1381,6 +1381,16 @@ isc__nm_tlsdns_read_cb(uv_stream_t *stream, ssize_t nread,
} }
free: free:
async_tlsdns_cycle(sock); async_tlsdns_cycle(sock);
if (nread < 0) {
/*
* The buffer may be a null buffer on error.
*/
if (buf->base == NULL && buf->len == 0) {
return;
}
}
isc__nm_free_uvbuf(sock, buf); isc__nm_free_uvbuf(sock, buf);
} }
@@ -1516,12 +1526,12 @@ accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota) {
csock->tls.tls = isc_tls_create(ssock->tls.ctx); csock->tls.tls = isc_tls_create(ssock->tls.ctx);
RUNTIME_CHECK(csock->tls.tls != NULL); RUNTIME_CHECK(csock->tls.tls != NULL);
r = BIO_new_bio_pair(&csock->tls.ssl_wbio, ISC_NETMGR_TLSBUF_SIZE, r = BIO_new_bio_pair(&csock->tls.ssl_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE,
&csock->tls.app_rbio, ISC_NETMGR_TLSBUF_SIZE); &csock->tls.app_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE);
RUNTIME_CHECK(r == 1); RUNTIME_CHECK(r == 1);
r = BIO_new_bio_pair(&csock->tls.ssl_rbio, ISC_NETMGR_TLSBUF_SIZE, r = BIO_new_bio_pair(&csock->tls.ssl_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE,
&csock->tls.app_wbio, ISC_NETMGR_TLSBUF_SIZE); &csock->tls.app_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE);
RUNTIME_CHECK(r == 1); RUNTIME_CHECK(r == 1);
#if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO #if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO

View File

@@ -431,7 +431,7 @@ isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0) {
REQUIRE(sock->parent != NULL); REQUIRE(sock->parent != NULL);
REQUIRE(sock->tid == isc_nm_tid()); REQUIRE(sock->tid == isc_nm_tid());
#ifdef UV_UDP_RECVMMSG #if HAVE_DECL_UV_UDP_RECVMMSG
uv_init_flags |= UV_UDP_RECVMMSG; uv_init_flags |= UV_UDP_RECVMMSG;
#endif #endif
r = uv_udp_init_ex(&worker->loop, &sock->uv_handle.udp, uv_init_flags); r = uv_udp_init_ex(&worker->loop, &sock->uv_handle.udp, uv_init_flags);
@@ -556,7 +556,6 @@ udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf,
isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)handle); isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)handle);
isc__nm_uvreq_t *req = NULL; isc__nm_uvreq_t *req = NULL;
uint32_t maxudp; uint32_t maxudp;
bool free_buf;
isc_result_t result; isc_result_t result;
isc_sockaddr_t sockaddr, *sa = NULL; isc_sockaddr_t sockaddr, *sa = NULL;
@@ -564,19 +563,22 @@ udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf,
REQUIRE(sock->tid == isc_nm_tid()); REQUIRE(sock->tid == isc_nm_tid());
REQUIRE(atomic_load(&sock->reading)); REQUIRE(atomic_load(&sock->reading));
#ifdef UV_UDP_MMSG_FREE /*
free_buf = ((flags & UV_UDP_MMSG_FREE) == UV_UDP_MMSG_FREE); * When using recvmmsg(2), if no errors occur, there will be a final
#elif UV_UDP_MMSG_CHUNK * callback with nrecv set to 0, addr set to NULL and the buffer
free_buf = ((flags & UV_UDP_MMSG_CHUNK) == 0); * pointing at the initially allocated data with the UV_UDP_MMSG_CHUNK
* flag cleared and the UV_UDP_MMSG_FREE flag set.
*/
#if HAVE_DECL_UV_UDP_MMSG_FREE
if ((flags & UV_UDP_MMSG_FREE) == UV_UDP_MMSG_FREE) {
INSIST(nrecv == 0);
INSIST(addr == NULL);
goto free;
}
#else #else
free_buf = true;
UNUSED(flags); UNUSED(flags);
#endif #endif
/*
* Four possible reasons to return now without processing:
*/
/* /*
* - If we're simulating a firewall blocking UDP packets * - If we're simulating a firewall blocking UDP packets
* bigger than 'maxudp' bytes for testing purposes. * bigger than 'maxudp' bytes for testing purposes.
@@ -640,9 +642,31 @@ udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf,
sock->processing = false; sock->processing = false;
free: free:
if (free_buf) { #if HAVE_DECL_UV_UDP_MMSG_CHUNK
isc__nm_free_uvbuf(sock, buf); /*
* When using recvmmsg(2), chunks will have the UV_UDP_MMSG_CHUNK flag
* set, those must not be freed.
*/
if ((flags & UV_UDP_MMSG_CHUNK) == UV_UDP_MMSG_CHUNK) {
return;
} }
#endif
/*
* When using recvmmsg(2), if a UDP socket error occurs, nrecv will be <
* 0. In either scenario, the callee can now safely free the provided
* buffer.
*/
if (nrecv < 0) {
/*
* The buffer may be a null buffer on error.
*/
if (buf->base == NULL && buf->len == 0) {
return;
}
}
isc__nm_free_uvbuf(sock, buf);
} }
/* /*