2
0
mirror of https://gitlab.isc.org/isc-projects/bind9 synced 2025-09-02 07:35:26 +00:00

Redesigned TCP accepting: one listen/accept loop, passing the connected socket.

Instead of using bind() and passing the listening socket to the child
threads via uv_export/uv_import, use a single thread that does the
accepting and then passes each connected socket, via uv_export/uv_import,
to a random worker. The previous solution had a thundering-herd problem
(all workers waking up on one connection and trying to accept()); this
one avoids that problem and is simpler.
The TCP clients quota is simplified with isc_quota_attach_cb(): a callback
is issued when the quota becomes available.
This commit is contained in:
Witold Kręcicki
2020-03-24 13:38:51 +01:00
parent 6ace801ddf
commit 60629e5b0b
4 changed files with 278 additions and 317 deletions

View File

@@ -120,11 +120,13 @@ isc_quota_attach_cb(isc_quota_t *quota, isc_quota_t **p, isc_quota_cb_t *cb);
* *
* Like isc_quota_attach(), but if there's no quota left then cb->cb_func will * Like isc_quota_attach(), but if there's no quota left then cb->cb_func will
* be called when we are attached to quota. * be called when we are attached to quota.
* Note: It's the callee responsibility to make sure that we don't end up with *
* extremely huge number of callbacks waiting - making it easy to create a * Note: It's the caller's responsibility to make sure that we don't end up
* resource exhaustion attack. For example in case of TCP listening we simply * with a huge number of callbacks waiting, making it easy to create a
* don't accept new connections - so the number of callbacks waiting in the * resource exhaustion attack. For example, in the case of TCP listening,
* queue is limited by listen() backlog. * we simply don't accept new connections when the quota is exceeded, so
* the number of callbacks waiting in the queue will be limited by the
* listen() backlog.
* *
* Returns: * Returns:
* \li #ISC_R_SUCCESS Success * \li #ISC_R_SUCCESS Success

View File

@@ -22,6 +22,7 @@
#include <isc/mem.h> #include <isc/mem.h>
#include <isc/netmgr.h> #include <isc/netmgr.h>
#include <isc/queue.h> #include <isc/queue.h>
#include <isc/quota.h>
#include <isc/random.h> #include <isc/random.h>
#include <isc/refcount.h> #include <isc/refcount.h>
#include <isc/region.h> #include <isc/region.h>
@@ -131,8 +132,8 @@ typedef enum isc__netievent_type {
netievent_tcprecv, netievent_tcprecv,
netievent_tcpstartread, netievent_tcpstartread,
netievent_tcppauseread, netievent_tcppauseread,
netievent_tcpchildlisten, netievent_tcpchildaccept,
netievent_tcpchildstop, netievent_tcpaccept,
netievent_tcpstop, netievent_tcpstop,
netievent_tcpclose, netievent_tcpclose,
netievent_tcpdnsclose, netievent_tcpdnsclose,
@@ -211,7 +212,6 @@ typedef struct isc__netievent__socket {
typedef isc__netievent__socket_t isc__netievent_udplisten_t; typedef isc__netievent__socket_t isc__netievent_udplisten_t;
typedef isc__netievent__socket_t isc__netievent_udpstop_t; typedef isc__netievent__socket_t isc__netievent_udpstop_t;
typedef isc__netievent__socket_t isc__netievent_tcpstop_t; typedef isc__netievent__socket_t isc__netievent_tcpstop_t;
typedef isc__netievent__socket_t isc__netievent_tcpchildstop_t;
typedef isc__netievent__socket_t isc__netievent_tcpclose_t; typedef isc__netievent__socket_t isc__netievent_tcpclose_t;
typedef isc__netievent__socket_t isc__netievent_tcpdnsclose_t; typedef isc__netievent__socket_t isc__netievent_tcpdnsclose_t;
typedef isc__netievent__socket_t isc__netievent_startread_t; typedef isc__netievent__socket_t isc__netievent_startread_t;
@@ -228,13 +228,15 @@ typedef isc__netievent__socket_req_t isc__netievent_tcpconnect_t;
typedef isc__netievent__socket_req_t isc__netievent_tcplisten_t; typedef isc__netievent__socket_req_t isc__netievent_tcplisten_t;
typedef isc__netievent__socket_req_t isc__netievent_tcpsend_t; typedef isc__netievent__socket_req_t isc__netievent_tcpsend_t;
typedef struct isc__netievent__socket_streaminfo { typedef struct isc__netievent__socket_streaminfo_quota {
isc__netievent_type type; isc__netievent_type type;
isc_nmsocket_t *sock; isc_nmsocket_t *sock;
isc_uv_stream_info_t streaminfo; isc_uv_stream_info_t streaminfo;
} isc__netievent__socket_streaminfo_t; isc_quota_t *quota;
} isc__netievent__socket_streaminfo_quota_t;
typedef isc__netievent__socket_streaminfo_t isc__netievent_tcpchildlisten_t; typedef isc__netievent__socket_streaminfo_quota_t
isc__netievent_tcpchildaccept_t;
typedef struct isc__netievent__socket_handle { typedef struct isc__netievent__socket_handle {
isc__netievent_type type; isc__netievent_type type;
@@ -242,6 +244,14 @@ typedef struct isc__netievent__socket_handle {
isc_nmhandle_t *handle; isc_nmhandle_t *handle;
} isc__netievent__socket_handle_t; } isc__netievent__socket_handle_t;
typedef struct isc__netievent__socket_quota {
isc__netievent_type type;
isc_nmsocket_t *sock;
isc_quota_t *quota;
} isc__netievent__socket_quota_t;
typedef isc__netievent__socket_quota_t isc__netievent_tcpaccept_t;
typedef struct isc__netievent_udpsend { typedef struct isc__netievent_udpsend {
isc__netievent_type type; isc__netievent_type type;
isc_nmsocket_t *sock; isc_nmsocket_t *sock;
@@ -261,7 +271,8 @@ typedef union {
isc__netievent__socket_t nis; isc__netievent__socket_t nis;
isc__netievent__socket_req_t nisr; isc__netievent__socket_req_t nisr;
isc__netievent_udpsend_t nius; isc__netievent_udpsend_t nius;
isc__netievent__socket_streaminfo_t niss; isc__netievent__socket_quota_t nisq;
isc__netievent__socket_streaminfo_quota_t nissq;
} isc__netievent_storage_t; } isc__netievent_storage_t;
/* /*
@@ -324,7 +335,6 @@ typedef enum isc_nmsocket_type {
isc_nm_udplistener, /* Aggregate of nm_udpsocks */ isc_nm_udplistener, /* Aggregate of nm_udpsocks */
isc_nm_tcpsocket, isc_nm_tcpsocket,
isc_nm_tcplistener, isc_nm_tcplistener,
isc_nm_tcpchildlistener,
isc_nm_tcpdnslistener, isc_nm_tcpdnslistener,
isc_nm_tcpdnssocket isc_nm_tcpdnssocket
} isc_nmsocket_type; } isc_nmsocket_type;
@@ -370,16 +380,7 @@ struct isc_nmsocket {
*/ */
isc_quota_t *quota; isc_quota_t *quota;
isc_quota_t *pquota; isc_quota_t *pquota;
isc_quota_cb_t quotacb;
/*%
* How many connections we have not accepted due to quota?
* When we close a connection we need to accept a new one.
*/
int overquota;
/*%
* How many active connections we have?
*/
int conns;
/*% /*%
* Socket statistics * Socket statistics
@@ -704,12 +705,12 @@ isc__nm_async_tcpconnect(isc__networker_t *worker, isc__netievent_t *ev0);
void void
isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0); isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0);
void void
isc__nm_async_tcpchildlisten(isc__networker_t *worker, isc__netievent_t *ev0); isc__nm_async_tcpaccept(isc__networker_t *worker, isc__netievent_t *ev0);
void
isc__nm_async_tcpchildaccept(isc__networker_t *worker, isc__netievent_t *ev0);
void void
isc__nm_async_tcpstop(isc__networker_t *worker, isc__netievent_t *ev0); isc__nm_async_tcpstop(isc__networker_t *worker, isc__netievent_t *ev0);
void void
isc__nm_async_tcpchildstop(isc__networker_t *worker, isc__netievent_t *ev0);
void
isc__nm_async_tcpsend(isc__networker_t *worker, isc__netievent_t *ev0); isc__nm_async_tcpsend(isc__networker_t *worker, isc__netievent_t *ev0);
void void
isc__nm_async_startread(isc__networker_t *worker, isc__netievent_t *ev0); isc__nm_async_startread(isc__networker_t *worker, isc__netievent_t *ev0);

View File

@@ -603,8 +603,11 @@ process_queue(isc__networker_t *worker, isc_queue_t *queue) {
case netievent_tcplisten: case netievent_tcplisten:
isc__nm_async_tcplisten(worker, ievent); isc__nm_async_tcplisten(worker, ievent);
break; break;
case netievent_tcpchildlisten: case netievent_tcpchildaccept:
isc__nm_async_tcpchildlisten(worker, ievent); isc__nm_async_tcpchildaccept(worker, ievent);
break;
case netievent_tcpaccept:
isc__nm_async_tcpaccept(worker, ievent);
break; break;
case netievent_tcpstartread: case netievent_tcpstartread:
isc__nm_async_tcp_startread(worker, ievent); isc__nm_async_tcp_startread(worker, ievent);
@@ -618,9 +621,6 @@ process_queue(isc__networker_t *worker, isc_queue_t *queue) {
case netievent_tcpstop: case netievent_tcpstop:
isc__nm_async_tcpstop(worker, ievent); isc__nm_async_tcpstop(worker, ievent);
break; break;
case netievent_tcpchildstop:
isc__nm_async_tcpchildstop(worker, ievent);
break;
case netievent_tcpclose: case netievent_tcpclose:
isc__nm_async_tcpclose(worker, ievent); isc__nm_async_tcpclose(worker, ievent);
break; break;
@@ -922,6 +922,7 @@ isc__nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type,
sock->ah_size * sizeof(size_t)); sock->ah_size * sizeof(size_t));
sock->ah_handles = isc_mem_allocate( sock->ah_handles = isc_mem_allocate(
mgr->mctx, sock->ah_size * sizeof(isc_nmhandle_t *)); mgr->mctx, sock->ah_size * sizeof(isc_nmhandle_t *));
ISC_LINK_INIT(&sock->quotacb, link);
for (size_t i = 0; i < 32; i++) { for (size_t i = 0; i < 32; i++) {
sock->ah_frees[i] = i; sock->ah_frees[i] = i;
sock->ah_handles[i] = NULL; sock->ah_handles[i] = NULL;
@@ -939,7 +940,6 @@ isc__nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type,
break; break;
case isc_nm_tcpsocket: case isc_nm_tcpsocket:
case isc_nm_tcplistener: case isc_nm_tcplistener:
case isc_nm_tcpchildlistener:
if (family == AF_INET) { if (family == AF_INET) {
sock->statsindex = tcp4statsindex; sock->statsindex = tcp4statsindex;
} else { } else {

View File

@@ -68,10 +68,10 @@ read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf);
static void static void
tcp_close_cb(uv_handle_t *uvhandle); tcp_close_cb(uv_handle_t *uvhandle);
static void
stoplistening(isc_nmsocket_t *sock);
static void static void
tcp_listenclose_cb(uv_handle_t *handle); tcp_listenclose_cb(uv_handle_t *handle);
static isc_result_t
accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota);
static int static int
tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
@@ -167,11 +167,6 @@ isc_nm_listentcp(isc_nm_t *mgr, isc_nmiface_t *iface, isc_nm_cb_t cb,
nsock = isc_mem_get(mgr->mctx, sizeof(*nsock)); nsock = isc_mem_get(mgr->mctx, sizeof(*nsock));
isc__nmsocket_init(nsock, mgr, isc_nm_tcplistener, iface); isc__nmsocket_init(nsock, mgr, isc_nm_tcplistener, iface);
nsock->nchildren = mgr->nworkers;
atomic_init(&nsock->rchildren, mgr->nworkers);
nsock->children = isc_mem_get(mgr->mctx,
mgr->nworkers * sizeof(*nsock));
memset(nsock->children, 0, mgr->nworkers * sizeof(*nsock));
nsock->rcb.accept = cb; nsock->rcb.accept = cb;
nsock->rcbarg = cbarg; nsock->rcbarg = cbarg;
nsock->extrahandlesize = extrahandlesize; nsock->extrahandlesize = extrahandlesize;
@@ -216,15 +211,10 @@ isc_nm_listentcp(isc_nm_t *mgr, isc_nmiface_t *iface, isc_nm_cb_t cb,
} }
/* /*
* For multi-threaded TCP listening, we create a single "parent" socket, * For multi-threaded TCP listening, we create a single socket,
* bind to it, and then pass its uv_handle to a set of child sockets, one * bind to it, and start listening. On an incoming connection we accept
* per worker. For thread safety, the passing of the socket's uv_handle has * it, and then pass the accepted socket using the uv_export/uv_import
* to be done via IPC socket. * mechanism to a child thread.
*
* This design pattern is ugly but it's what's recommended by the libuv
* documentation. (A prior version of libuv had uv_export() and
* uv_import() functions which would have simplified this greatly, but
* they have been deprecated and removed.)
*/ */
void void
isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) { isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) {
@@ -236,24 +226,6 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) {
REQUIRE(isc__nm_in_netthread()); REQUIRE(isc__nm_in_netthread());
REQUIRE(sock->type == isc_nm_tcplistener); REQUIRE(sock->type == isc_nm_tcplistener);
/* Initialize children now to make cleaning up easier */
for (int i = 0; i < sock->nchildren; i++) {
isc_nmsocket_t *csock = &sock->children[i];
isc__nmsocket_init(csock, sock->mgr, isc_nm_tcpchildlistener,
sock->iface);
csock->parent = sock;
csock->tid = i;
csock->pquota = sock->pquota;
csock->backlog = sock->backlog;
csock->extrahandlesize = sock->extrahandlesize;
INSIST(csock->rcb.recv == NULL && csock->rcbarg == NULL);
csock->rcb.accept = sock->rcb.accept;
csock->rcbarg = sock->rcbarg;
csock->fd = -1;
}
r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp);
if (r != 0) { if (r != 0) {
/* It was never opened */ /* It was never opened */
@@ -295,39 +267,25 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) {
goto done; goto done;
} }
uv_handle_set_data(&sock->uv_handle.handle, sock);
/* /*
* For each worker, we send a 'tcpchildlisten' event with * The callback will run in the same thread uv_listen() was called
* the exported socket. * from, so a race with tcp_connection_cb() isn't possible.
*/ */
for (int i = 0; i < sock->nchildren; i++) { r = uv_listen((uv_stream_t *)&sock->uv_handle.tcp, sock->backlog,
isc_nmsocket_t *csock = &sock->children[i]; tcp_connection_cb);
isc__netievent_tcpchildlisten_t *event = NULL;
event = isc__nm_get_ievent(csock->mgr,
netievent_tcpchildlisten);
r = isc_uv_export(&sock->uv_handle.stream, &event->streaminfo);
if (r != 0) { if (r != 0) {
isc_log_write( isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR,
"uv_export failed: %s", "uv_listen failed: %s",
isc_result_totext(isc__nm_uverr2result(r))); isc_result_totext(isc__nm_uverr2result(r)));
isc__nm_put_ievent(sock->mgr, event); uv_close(&sock->uv_handle.handle, tcp_close_cb);
continue; sock->result = isc__nm_uverr2result(r);
} atomic_store(&sock->listen_error, true);
event->sock = csock; goto done;
if (csock->tid == isc_nm_tid()) {
isc__nm_async_tcpchildlisten(&sock->mgr->workers[i],
(isc__netievent_t *)event);
isc__nm_put_ievent(sock->mgr, event);
} else {
isc__nm_enqueue_ievent(&sock->mgr->workers[i],
(isc__netievent_t *)event);
}
} }
uv_handle_set_data(&sock->uv_handle.handle, sock);
atomic_store(&sock->listening, true); atomic_store(&sock->listening, true);
done: done:
@@ -337,43 +295,113 @@ done:
return; return;
} }
/* static void
* Connect to the parent socket and be ready to receive the uv_handle tcp_connection_cb(uv_stream_t *server, int status) {
* for the socket we'll be listening on. isc_nmsocket_t *psock = uv_handle_get_data((uv_handle_t *)server);
*/ isc_result_t result;
UNUSED(status);
result = accept_connection(psock, NULL);
if (result != ISC_R_SUCCESS && result != ISC_R_NOCONN) {
if ((result != ISC_R_QUOTA && result != ISC_R_SOFTQUOTA) ||
can_log_tcp_quota()) {
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR,
"TCP connection failed: %s",
isc_result_totext(result));
}
}
}
void void
isc__nm_async_tcpchildlisten(isc__networker_t *worker, isc__netievent_t *ev0) { isc__nm_async_tcpchildaccept(isc__networker_t *worker, isc__netievent_t *ev0) {
isc__netievent_tcpchildlisten_t *ievent = isc__netievent_tcpchildaccept_t *ievent =
(isc__netievent_tcpchildlisten_t *)ev0; (isc__netievent_tcpchildaccept_t *)ev0;
isc_nmsocket_t *sock = ievent->sock; isc_nmsocket_t *ssock = ievent->sock;
isc_nmsocket_t *csock = NULL;
isc_nmhandle_t *handle;
isc_result_t result;
struct sockaddr_storage ss;
isc_sockaddr_t local;
int r; int r;
REQUIRE(isc__nm_in_netthread()); REQUIRE(isc__nm_in_netthread());
REQUIRE(sock->type == isc_nm_tcpchildlistener); REQUIRE(ssock->type == isc_nm_tcplistener);
worker = &sock->mgr->workers[isc_nm_tid()]; csock = isc_mem_get(ssock->mgr->mctx, sizeof(isc_nmsocket_t));
isc__nmsocket_init(csock, ssock->mgr, isc_nm_tcpsocket, ssock->iface);
csock->tid = isc_nm_tid();
csock->extrahandlesize = ssock->extrahandlesize;
uv_tcp_init(&worker->loop, (uv_tcp_t *)&sock->uv_handle.tcp); csock->quota = ievent->quota;
uv_handle_set_data(&sock->uv_handle.handle, sock); ievent->quota = NULL;
r = isc_uv_import(&sock->uv_handle.stream, &ievent->streaminfo);
worker = &ssock->mgr->workers[isc_nm_tid()];
uv_tcp_init(&worker->loop, &csock->uv_handle.tcp);
r = isc_uv_import(&csock->uv_handle.stream, &ievent->streaminfo);
if (r != 0) { if (r != 0) {
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR,
"uv_import failed: %s", "uv_import failed: %s",
isc_result_totext(isc__nm_uverr2result(r))); isc_result_totext(isc__nm_uverr2result(r)));
return; result = isc__nm_uverr2result(r);
goto error;
} }
r = uv_listen((uv_stream_t *)&sock->uv_handle.tcp, sock->backlog, r = uv_tcp_getpeername(&csock->uv_handle.tcp, (struct sockaddr *)&ss,
tcp_connection_cb); &(int){ sizeof(ss) });
if (r != 0) { if (r != 0) {
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, result = isc__nm_uverr2result(r);
ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, goto error;
"uv_listen failed: %s",
isc_result_totext(isc__nm_uverr2result(r)));
return;
} }
result = isc_sockaddr_fromsockaddr(&csock->peer,
(struct sockaddr *)&ss);
if (result != ISC_R_SUCCESS) {
goto error;
}
r = uv_tcp_getsockname(&csock->uv_handle.tcp, (struct sockaddr *)&ss,
&(int){ sizeof(ss) });
if (r != 0) {
result = isc__nm_uverr2result(r);
goto error;
}
result = isc_sockaddr_fromsockaddr(&local, (struct sockaddr *)&ss);
if (result != ISC_R_SUCCESS) {
goto error;
}
isc_nmsocket_attach(ssock, &csock->server);
handle = isc__nmhandle_get(csock, NULL, &local);
INSIST(ssock->rcb.accept != NULL);
csock->read_timeout = ssock->mgr->init;
ssock->rcb.accept(handle, ISC_R_SUCCESS, ssock->rcbarg);
isc_nmsocket_detach(&csock);
return;
error:
/*
* Detach the quota early to make room for other connections;
* otherwise it'd be detached later asynchronously, and clog
* the quota unnecessarily.
*/
if (csock->quota != NULL) {
isc_quota_detach(&csock->quota);
}
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR,
ISC_LOG_ERROR, "Accepting TCP connection failed: %s",
isc_result_totext(result));
/*
* Detach the socket properly to make sure uv_close() is called.
*/
isc_nmsocket_detach(&csock);
} }
void void
@@ -411,83 +439,24 @@ isc__nm_async_tcpstop(isc__networker_t *worker, isc__netievent_t *ev0) {
isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
(isc__netievent_t *)event); (isc__netievent_t *)event);
} else { } else {
stoplistening(sock); uv_close((uv_handle_t *)&sock->uv_handle.tcp,
tcp_listenclose_cb);
isc__nm_drop_interlocked(sock->mgr); isc__nm_drop_interlocked(sock->mgr);
} }
} }
static void
stoplistening(isc_nmsocket_t *sock) {
for (int i = 0; i < sock->nchildren; i++) {
isc__netievent_tcpchildstop_t *event = NULL;
/*
* We can ignore the overhead of event allocation because
* stoplistening is a rare event, and doing it this way
* simplifies sock reference counting.
*/
event = isc__nm_get_ievent(sock->mgr, netievent_tcpchildstop);
isc_nmsocket_attach(&sock->children[i], &event->sock);
if (isc_nm_tid() == sock->children[i].tid) {
isc__nm_async_tcpchildstop(&sock->mgr->workers[i],
(isc__netievent_t *)event);
isc__nm_put_ievent(sock->mgr, event);
} else {
isc__nm_enqueue_ievent(&sock->mgr->workers[i],
(isc__netievent_t *)event);
}
}
LOCK(&sock->lock);
while (atomic_load_relaxed(&sock->rchildren) > 0) {
WAIT(&sock->cond, &sock->lock);
}
UNLOCK(&sock->lock);
uv_close((uv_handle_t *)&sock->uv_handle.tcp, tcp_listenclose_cb);
}
void
isc__nm_async_tcpchildstop(isc__networker_t *worker, isc__netievent_t *ev0) {
isc__netievent_tcpchildstop_t *ievent =
(isc__netievent_tcpchildstop_t *)ev0;
isc_nmsocket_t *sock = ievent->sock;
UNUSED(worker);
REQUIRE(VALID_NMSOCK(sock));
REQUIRE(isc_nm_tid() == sock->tid);
REQUIRE(sock->type == isc_nm_tcpchildlistener);
REQUIRE(sock->parent != NULL);
/*
* rchildren is atomic, but we still need to change it
* under a lock because the parent is waiting on conditional
* and without it we might deadlock.
*/
LOCK(&sock->parent->lock);
atomic_fetch_sub(&sock->parent->rchildren, 1);
UNLOCK(&sock->parent->lock);
uv_close((uv_handle_t *)&sock->uv_handle.tcp, tcp_listenclose_cb);
BROADCAST(&sock->parent->cond);
}
/* /*
* This callback is used for closing both child and parent listening * This callback is used for closing listening sockets.
* sockets; that's why we need to choose the proper lock.
*/ */
static void static void
tcp_listenclose_cb(uv_handle_t *handle) { tcp_listenclose_cb(uv_handle_t *handle) {
isc_nmsocket_t *sock = uv_handle_get_data(handle); isc_nmsocket_t *sock = uv_handle_get_data(handle);
isc_mutex_t *lock = ((sock->parent != NULL) ? &sock->parent->lock
: &sock->lock);
LOCK(lock); LOCK(&sock->lock);
atomic_store(&sock->closed, true); atomic_store(&sock->closed, true);
atomic_store(&sock->listening, false); atomic_store(&sock->listening, false);
sock->pquota = NULL; sock->pquota = NULL;
UNLOCK(lock); UNLOCK(&sock->lock);
isc_nmsocket_detach(&sock); isc_nmsocket_detach(&sock);
} }
@@ -683,130 +652,34 @@ read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) {
*/ */
} }
static isc_result_t static void
accept_connection(isc_nmsocket_t *ssock) { quota_accept_cb(isc_quota_t *quota, void *sock0) {
isc_result_t result; isc_nmsocket_t *sock = (isc_nmsocket_t *)sock0;
isc_quota_t *quota = NULL; isc__netievent_tcpaccept_t *ievent = NULL;
isc_nmsocket_t *csock = NULL;
isc__networker_t *worker = NULL;
isc_nmhandle_t *handle = NULL;
struct sockaddr_storage ss;
isc_sockaddr_t local;
int r;
bool overquota = false;
REQUIRE(VALID_NMSOCK(ssock)); REQUIRE(VALID_NMSOCK(sock));
REQUIRE(ssock->tid == isc_nm_tid());
if (!atomic_load_relaxed(&ssock->active) ||
atomic_load_relaxed(&ssock->mgr->closing))
{
/* We're closing, bail */
return (ISC_R_CANCELED);
}
if (ssock->pquota != NULL) {
result = isc_quota_attach(ssock->pquota, &quota);
/* /*
* We share the quota between all TCP sockets. Others * Create a tcpaccept event and pass it using the async channel.
* may have used up all the quota slots, in which case
* this socket could starve. So we only fail here if we
* already had at least one active connection on this
* socket. This guarantees that we'll maintain some level
* of service while over quota, and will resume normal
* service when the quota comes back down.
*/ */
if (result != ISC_R_SUCCESS) { ievent = isc__nm_get_ievent(sock->mgr, netievent_tcpaccept);
ssock->overquota++; ievent->sock = sock;
overquota = true; ievent->quota = quota;
if (ssock->conns > 0) { isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
isc__nm_incstats( (isc__netievent_t *)ievent);
ssock->mgr,
ssock->statsindex[STATID_ACCEPTFAIL]);
return (result);
}
}
}
isc__nm_incstats(ssock->mgr, ssock->statsindex[STATID_ACCEPT]);
csock = isc_mem_get(ssock->mgr->mctx, sizeof(isc_nmsocket_t));
isc__nmsocket_init(csock, ssock->mgr, isc_nm_tcpsocket, ssock->iface);
csock->tid = isc_nm_tid();
csock->extrahandlesize = ssock->extrahandlesize;
csock->quota = quota;
quota = NULL;
worker = &ssock->mgr->workers[isc_nm_tid()];
uv_tcp_init(&worker->loop, &csock->uv_handle.tcp);
r = uv_accept(&ssock->uv_handle.stream, &csock->uv_handle.stream);
if (r != 0) {
result = isc__nm_uverr2result(r);
goto error;
}
r = uv_tcp_getpeername(&csock->uv_handle.tcp, (struct sockaddr *)&ss,
&(int){ sizeof(ss) });
if (r != 0) {
result = isc__nm_uverr2result(r);
goto error;
}
result = isc_sockaddr_fromsockaddr(&csock->peer,
(struct sockaddr *)&ss);
if (result != ISC_R_SUCCESS) {
goto error;
}
r = uv_tcp_getsockname(&csock->uv_handle.tcp, (struct sockaddr *)&ss,
&(int){ sizeof(ss) });
if (r != 0) {
result = isc__nm_uverr2result(r);
goto error;
}
result = isc_sockaddr_fromsockaddr(&local, (struct sockaddr *)&ss);
if (result != ISC_R_SUCCESS) {
goto error;
}
isc_nmsocket_attach(ssock, &csock->server);
ssock->conns++;
handle = isc__nmhandle_get(csock, NULL, &local);
INSIST(ssock->rcb.accept != NULL);
csock->read_timeout = ssock->mgr->init;
ssock->rcb.accept(handle, ISC_R_SUCCESS, ssock->rcbarg);
isc_nmsocket_detach(&csock);
return (ISC_R_SUCCESS);
error:
/*
* Detach it early to make room for other connections, otherwise
* it'd be detached later asynchronously clogging the quota.
*/
if (csock->quota != NULL) {
isc_quota_detach(&csock->quota);
}
if (overquota) {
ssock->overquota--;
}
/* We need to detach it properly to make sure uv_close is called. */
isc_nmsocket_detach(&csock);
return (result);
} }
static void /*
tcp_connection_cb(uv_stream_t *server, int status) { * This is called after we get a quota_accept_cb() callback.
isc_nmsocket_t *ssock = uv_handle_get_data((uv_handle_t *)server); */
void
isc__nm_async_tcpaccept(isc__networker_t *worker, isc__netievent_t *ev0) {
isc_result_t result; isc_result_t result;
isc__netievent_tcpaccept_t *ievent = (isc__netievent_tcpaccept_t *)ev0;
UNUSED(status); REQUIRE(worker->id == ievent->sock->tid);
result = accept_connection(ssock); result = accept_connection(ievent->sock, ievent->quota);
if (result != ISC_R_SUCCESS && result != ISC_R_NOCONN) { if (result != ISC_R_SUCCESS && result != ISC_R_NOCONN) {
if ((result != ISC_R_QUOTA && result != ISC_R_SOFTQUOTA) || if ((result != ISC_R_QUOTA && result != ISC_R_SOFTQUOTA) ||
can_log_tcp_quota()) { can_log_tcp_quota()) {
@@ -816,6 +689,112 @@ tcp_connection_cb(uv_stream_t *server, int status) {
isc_result_totext(result)); isc_result_totext(result));
} }
} }
/*
* The socket was attached just before we called isc_quota_attach_cb().
*/
isc_nmsocket_detach(&ievent->sock);
}
/*
* Close callback for uv_tcp_t strutures created in accept_connection().
*/
static void
free_uvtcpt(uv_handle_t *uvs) {
isc_mem_t *mctx = (isc_mem_t *)uv_handle_get_data(uvs);
isc_mem_putanddetach(&mctx, uvs, sizeof(uv_tcp_t));
}
static isc_result_t
accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota) {
isc_result_t result;
isc__netievent_tcpchildaccept_t *event = NULL;
isc__networker_t *worker = NULL;
uv_tcp_t *uvstream = NULL;
isc_mem_t *mctx = NULL;
int r, w;
REQUIRE(VALID_NMSOCK(ssock));
REQUIRE(ssock->tid == isc_nm_tid());
if (!atomic_load_relaxed(&ssock->active) ||
atomic_load_relaxed(&ssock->mgr->closing))
{
/* We're closing, bail */
if (quota != NULL) {
isc_quota_detach(&quota);
}
return (ISC_R_CANCELED);
}
/* We can be called directly or as a callback from quota */
if (ssock->pquota != NULL && quota == NULL) {
/*
* We need to attach to ssock, because it might be queued
* waiting for a TCP quota slot. If so, then we'll detach it
* later when the connection is accepted. (XXX: This may be
* suboptimal, it might be better to attach unless
* we need to.)
*/
isc_nmsocket_t *tsock = NULL;
isc_nmsocket_attach(ssock, &tsock);
isc_quota_cb_init(&ssock->quotacb, quota_accept_cb, tsock);
result = isc_quota_attach_cb(ssock->pquota, &quota,
&ssock->quotacb);
if (result == ISC_R_QUOTA) {
isc__nm_incstats(ssock->mgr,
ssock->statsindex[STATID_ACCEPTFAIL]);
return (result);
}
/*
* We're under quota, so there's no need to wait;
* clear the quota callback and and detach the socket.
*/
isc_quota_cb_init(&ssock->quotacb, NULL, NULL);
isc_nmsocket_detach(&tsock);
}
isc__nm_incstats(ssock->mgr, ssock->statsindex[STATID_ACCEPT]);
worker = &ssock->mgr->workers[isc_nm_tid()];
uvstream = isc_mem_get(ssock->mgr->mctx, sizeof(uv_tcp_t));
isc_mem_attach(ssock->mgr->mctx, &mctx);
uv_handle_set_data((uv_handle_t *)uvstream, mctx);
mctx = NULL; /* Detached later in free_uvtcpt() */
uv_tcp_init(&worker->loop, uvstream);
r = uv_accept(&ssock->uv_handle.stream, (uv_stream_t *)uvstream);
if (r != 0) {
result = isc__nm_uverr2result(r);
uv_close((uv_handle_t *)uvstream, free_uvtcpt);
isc_quota_detach(&quota);
return (result);
}
/* We have an accepted TCP socket, pass it to a random worker */
w = isc_random_uniform(ssock->mgr->nworkers);
event = isc__nm_get_ievent(ssock->mgr, netievent_tcpchildaccept);
event->sock = ssock;
event->quota = quota;
r = isc_uv_export((uv_stream_t *)uvstream, &event->streaminfo);
RUNTIME_CHECK(r == 0);
uv_close((uv_handle_t *)uvstream, free_uvtcpt);
if (w == isc_nm_tid()) {
isc__nm_async_tcpchildaccept(&ssock->mgr->workers[w],
(isc__netievent_t *)event);
isc__nm_put_ievent(ssock->mgr, event);
} else {
isc__nm_enqueue_ievent(&ssock->mgr->workers[w],
(isc__netievent_t *)event);
}
return (ISC_R_SUCCESS);
} }
isc_result_t isc_result_t
@@ -943,30 +922,9 @@ tcp_close_direct(isc_nmsocket_t *sock) {
REQUIRE(VALID_NMSOCK(sock)); REQUIRE(VALID_NMSOCK(sock));
REQUIRE(sock->tid == isc_nm_tid()); REQUIRE(sock->tid == isc_nm_tid());
REQUIRE(sock->type == isc_nm_tcpsocket); REQUIRE(sock->type == isc_nm_tcpsocket);
isc_nmsocket_t *ssock = sock->server;
if (sock->quota != NULL) { if (sock->quota != NULL) {
isc_quota_detach(&sock->quota); isc_quota_detach(&sock->quota);
} }
if (ssock != NULL) {
ssock->conns--;
while (ssock->conns == 0 && ssock->overquota > 0) {
ssock->overquota--;
isc_result_t result = accept_connection(ssock);
if (result == ISC_R_SUCCESS || result == ISC_R_NOCONN) {
continue;
}
if ((result != ISC_R_QUOTA &&
result != ISC_R_SOFTQUOTA) ||
can_log_tcp_quota()) {
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_NETMGR,
ISC_LOG_ERROR,
"TCP connection failed: %s",
isc_result_totext(result));
}
}
}
if (sock->timer_initialized) { if (sock->timer_initialized) {
sock->timer_initialized = false; sock->timer_initialized = false;
uv_timer_stop(&sock->timer); uv_timer_stop(&sock->timer);