diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index 1c30c6c15b..ceeb5cbb27 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -834,7 +834,31 @@ isc__nm_decstats(isc_nm_t *mgr, isc_statscounter_t counterid); */ isc_result_t -isc__nm_socket_freebind(const uv_handle_t *handle); +isc__nm_socket_freebind(uv_os_sock_t fd, sa_family_t sa_family); /*%< * Set the IP_FREEBIND (or equivalent) socket option on the uv_handle */ + +isc_result_t +isc__nm_socket_reuse(uv_os_sock_t fd); +/*%< + * Set the SO_REUSEADDR or SO_REUSEPORT (or equivalent) socket option on the fd + */ + +isc_result_t +isc__nm_socket_reuse_lb(uv_os_sock_t fd); +/*%< + * Set the SO_REUSEPORT_LB (or equivalent) socket option on the fd + */ + +isc_result_t +isc__nm_socket_incoming_cpu(uv_os_sock_t fd); +/*%< + * Set the SO_INCOMING_CPU socket option on the fd if available + */ + +isc_result_t +isc__nm_socket_dontfrag(uv_os_sock_t fd, sa_family_t sa_family); +/*%< + * Set the IP_DONTFRAG (or equivalent) socket option on the fd if available + */ diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index d4e2bf29f5..28dc6f22e1 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -1584,51 +1584,169 @@ isc__nm_decstats(isc_nm_t *mgr, isc_statscounter_t counterid) { setsockopt(socket, level, name, &(int){ 1 }, sizeof(int)) isc_result_t -isc__nm_socket_freebind(const uv_handle_t *handle) { +isc__nm_socket_freebind(uv_os_sock_t fd, sa_family_t sa_family) { /* * Set the IP_FREEBIND (or equivalent option) on the uv_handle. 
*/ - isc_result_t result = ISC_R_SUCCESS; - uv_os_fd_t fd; - if (uv_fileno(handle, &fd) != 0) { - return (ISC_R_FAILURE); - } #ifdef IP_FREEBIND + UNUSED(sa_family); if (setsockopt_on(fd, IPPROTO_IP, IP_FREEBIND) == -1) { return (ISC_R_FAILURE); } + return (ISC_R_SUCCESS); #elif defined(IP_BINDANY) || defined(IPV6_BINDANY) - struct sockaddr_in sockfd; - - if (getsockname(fd, (struct sockaddr *)&sockfd, - &(socklen_t){ sizeof(sockfd) }) == -1) - { - return (ISC_R_FAILURE); - } + if (sa_family == AF_INET) { #if defined(IP_BINDANY) - if (sockfd.sin_family == AF_INET) { if (setsockopt_on(fd, IPPROTO_IP, IP_BINDANY) == -1) { return (ISC_R_FAILURE); } - } + return (ISC_R_SUCCESS); #endif + } else if (sa_family == AF_INET6) { #if defined(IPV6_BINDANY) - if (sockfd.sin_family == AF_INET6) { if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_BINDANY) == -1) { return (ISC_R_FAILURE); } - } + return (ISC_R_SUCCESS); #endif + } + return (ISC_R_NOTIMPLEMENTED); #elif defined(SO_BINDANY) + UNUSED(sa_family); if (setsockopt_on(fd, SOL_SOCKET, SO_BINDANY) == -1) { return (ISC_R_FAILURE); } + return (ISC_R_SUCCESS); #else - UNUSED(handle); UNUSED(fd); - result = ISC_R_NOTIMPLEMENTED; + UNUSED(sa_family); + return (ISC_R_NOTIMPLEMENTED); #endif - return (result); +} + +isc_result_t +isc__nm_socket_reuse(uv_os_sock_t fd) { + /* + * Generally, the SO_REUSEADDR socket option allows reuse of + * local addresses. + * + * On the BSDs, SO_REUSEPORT implies SO_REUSEADDR but with some + * additional refinements for programs that use multicast. + * + * On Linux, SO_REUSEPORT has different semantics: it _shares_ the port + * rather than steal it from the current listener, so we don't use it + * here, but rather in isc__nm_socket_reuse_lb(). + * + * On Windows, it also allows a socket to forcibly bind to a port in use + * by another socket. 
+ */ + +#if defined(SO_REUSEPORT) && !defined(__linux__) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#elif defined(SO_REUSEADDR) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEADDR) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#else + UNUSED(fd); + return (ISC_R_NOTIMPLEMENTED); +#endif +} + +isc_result_t +isc__nm_socket_reuse_lb(uv_os_sock_t fd) { + /* + * On FreeBSD 12+, SO_REUSEPORT_LB socket option allows sockets to be + * bound to an identical socket address. For UDP sockets, the use of + * this option can provide better distribution of incoming datagrams to + * multiple processes (or threads) as compared to the traditional + * technique of having multiple processes compete to receive datagrams + * on the same socket. + * + * On Linux, the same thing is achieved simply with SO_REUSEPORT. + */ +#if defined(SO_REUSEPORT_LB) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT_LB) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#elif defined(SO_REUSEPORT) && defined(__linux__) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); + return (ISC_R_NOTIMPLEMENTED); +#endif +} + +isc_result_t +isc__nm_socket_incoming_cpu(uv_os_sock_t fd) { +#ifdef SO_INCOMING_CPU + if (setsockopt_on(fd, SOL_SOCKET, SO_INCOMING_CPU) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); +#endif + return (ISC_R_NOTIMPLEMENTED); +} + +isc_result_t +isc__nm_socket_dontfrag(uv_os_sock_t fd, sa_family_t sa_family) { + /* + * Set the Don't Fragment flag on IP packets + */ + if (sa_family == AF_INET6) { +#if defined(IPV6_DONTFRAG) + if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_DONTFRAG) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#elif defined(IPV6_MTU_DISCOVER) + if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER, + 
&(int){ IP_PMTUDISC_DO }, sizeof(int)) == -1) + { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); +#endif + } else if (sa_family == AF_INET) { +#if defined(IP_DONTFRAG) + if (setsockopt_on(fd, IPPROTO_IP, IP_DONTFRAG) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#elif defined(IP_MTU_DISCOVER) + if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, + &(int){ IP_PMTUDISC_DO }, sizeof(int)) == -1) + { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); +#endif + } else { + return (ISC_R_FAMILYNOSUPPORT); + } + + return (ISC_R_NOTIMPLEMENTED); } #ifdef NETMGR_TRACE diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index fbfc5aebce..5f731ddfee 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -318,6 +318,8 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) { isc_nmsocket_t *sock = ievent->sock; struct sockaddr_storage sname; int r, flags = 0, snamelen = sizeof(sname); + sa_family_t sa_family; + uv_os_sock_t fd; REQUIRE(isc__nm_in_netthread()); REQUIRE(sock->type == isc_nm_tcplistener); @@ -334,14 +336,16 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) { isc__nm_incstats(sock->mgr, sock->statsindex[STATID_OPEN]); - if (sock->iface->addr.type.sa.sa_family == AF_INET6) { + sa_family = sock->iface->addr.type.sa.sa_family; + if (sa_family == AF_INET6) { flags = UV_TCP_IPV6ONLY; } r = uv_tcp_bind(&sock->uv_handle.tcp, &sock->iface->addr.type.sa, flags); if (r == UV_EADDRNOTAVAIL && - isc__nm_socket_freebind(&sock->uv_handle.handle) == ISC_R_SUCCESS) + uv_fileno(&sock->uv_handle.handle, (uv_os_fd_t *)&fd) == 0 && + isc__nm_socket_freebind(fd, sa_family) == ISC_R_SUCCESS) { /* * Retry binding with IP_FREEBIND (or equivalent option) if the diff --git a/lib/isc/netmgr/udp.c b/lib/isc/netmgr/udp.c index b575d80aff..7056a29bf2 100644 --- a/lib/isc/netmgr/udp.c +++ b/lib/isc/netmgr/udp.c @@ -65,8 +65,8 @@ 
isc_nm_listenudp(isc_nm_t *mgr, isc_nmiface_t *iface, isc_nm_recv_cb_t cb, nsock->extrahandlesize = extrahandlesize; for (size_t i = 0; i < mgr->nworkers; i++) { + isc_result_t result; uint16_t family = iface->addr.type.sa.sa_family; - int res = 0; isc__netievent_udplisten_t *ievent = NULL; isc_nmsocket_t *csock = &nsock->children[i]; @@ -82,46 +82,20 @@ isc_nm_listenudp(isc_nm_t *mgr, isc_nmiface_t *iface, isc_nm_recv_cb_t cb, csock->fd = socket(family, SOCK_DGRAM, 0); RUNTIME_CHECK(csock->fd >= 0); - /* - * This is SO_REUSE**** hell: - * - * Generally, the SO_REUSEADDR socket option allows reuse of - * local addresses. On Windows, it also allows a socket to - * forcibly bind to a port in use by another socket. - * - * On Linux, SO_REUSEPORT socket option allows sockets to be - * bound to an identical socket address. For UDP sockets, the - * use of this option can provide better distribution of - * incoming datagrams to multiple processes (or threads) as - * compared to the traditional technique of having multiple - * processes compete to receive datagrams on the same socket. - * - * On FreeBSD, the same thing is achieved with SO_REUSEPORT_LB. 
- * - */ -#if defined(SO_REUSEADDR) - res = setsockopt(csock->fd, SOL_SOCKET, SO_REUSEADDR, - &(int){ 1 }, sizeof(int)); - RUNTIME_CHECK(res == 0); -#endif -#if defined(SO_REUSEPORT_LB) - res = setsockopt(csock->fd, SOL_SOCKET, SO_REUSEPORT_LB, - &(int){ 1 }, sizeof(int)); - RUNTIME_CHECK(res == 0); -#elif defined(SO_REUSEPORT) - res = setsockopt(csock->fd, SOL_SOCKET, SO_REUSEPORT, - &(int){ 1 }, sizeof(int)); - RUNTIME_CHECK(res == 0); -#endif + result = isc__nm_socket_reuse(csock->fd); + RUNTIME_CHECK(result == ISC_R_SUCCESS || + result == ISC_R_NOTIMPLEMENTED); + + result = isc__nm_socket_reuse_lb(csock->fd); + RUNTIME_CHECK(result == ISC_R_SUCCESS || + result == ISC_R_NOTIMPLEMENTED); -#ifdef SO_INCOMING_CPU /* We don't check for the result, because SO_INCOMING_CPU can be * available without the setter on Linux kernel version 4.4, and * setting SO_INCOMING_CPU is just an optimization. */ - (void)setsockopt(csock->fd, SOL_SOCKET, SO_INCOMING_CPU, - &(int){ 1 }, sizeof(int)); -#endif + (void)isc__nm_socket_incoming_cpu(csock->fd); + ievent = isc__nm_get_ievent(mgr, netievent_udplisten); ievent->sock = csock; isc__nm_enqueue_ievent(&mgr->workers[i], @@ -167,6 +141,7 @@ isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0) { isc_nmsocket_t *sock = ievent->sock; int r, uv_bind_flags = 0; int uv_init_flags = 0; + sa_family_t sa_family; REQUIRE(sock->type == isc_nm_udpsocket); REQUIRE(sock->iface != NULL); @@ -188,14 +163,15 @@ isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0) { isc__nm_incstats(sock->mgr, sock->statsindex[STATID_OPENFAIL]); } - if (sock->iface->addr.type.sa.sa_family == AF_INET6) { + sa_family = sock->iface->addr.type.sa.sa_family; + if (sa_family == AF_INET6) { uv_bind_flags |= UV_UDP_IPV6ONLY; } r = uv_udp_bind(&sock->uv_handle.udp, &sock->parent->iface->addr.type.sa, uv_bind_flags); if (r == UV_EADDRNOTAVAIL && - isc__nm_socket_freebind(&sock->uv_handle.handle) == ISC_R_SUCCESS) + 
isc__nm_socket_freebind(sock->fd, sa_family) == ISC_R_SUCCESS) { /* * Retry binding with IP_FREEBIND (or equivalent option) if the diff --git a/lib/isc/win32/include/isc/platform.h.in b/lib/isc/win32/include/isc/platform.h.in index d51197ff2a..a8b645007c 100644 --- a/lib/isc/win32/include/isc/platform.h.in +++ b/lib/isc/win32/include/isc/platform.h.in @@ -56,6 +56,8 @@ typedef uint32_t socklen_t; #undef MSG_TRUNC +typedef uint16_t sa_family_t; + /* * Set up a macro for importing and exporting from the DLL */