2
0
mirror of https://gitlab.isc.org/isc-projects/bind9 synced 2025-08-31 06:25:31 +00:00

netmgr: make tcp listening multithreaded.

When listening for TCP connections we create a socket, bind it
and then pass it over IPC to all threads - which then listen on
in and accept connections. This sounds broken, but it's the
official way of dealing with multithreaded TCP listeners in libuv,
and works on all platforms supported by libuv.
This commit is contained in:
Witold Kręcicki
2019-11-28 10:21:34 +01:00
committed by Evan Hunt
parent 09c2dbffb5
commit bc5aae1579
4 changed files with 325 additions and 27 deletions

View File

@@ -50,6 +50,21 @@ read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf);
static void
tcp_close_cb(uv_handle_t *uvhandle);
static void
ipc_connection_cb(uv_stream_t *stream, int status);
static void
ipc_write_cb(uv_write_t* uvreq, int status);
static void
parent_pipe_close_cb(uv_handle_t *handle);
static void
childlisten_ipc_connect_cb(uv_connect_t *uvreq, int status);
static void
childlisten_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf);
static void
stoplistening(isc_nmsocket_t *sock);
static void
tcp_listenclose_cb(uv_handle_t *handle);
static int
tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
isc__networker_t *worker;
@@ -71,7 +86,7 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
return (r);
}
}
sock->uv_handle.tcp.data = sock;
r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp,
&req->peer.type.sa, tcp_connect_cb);
return (r);
@@ -134,7 +149,6 @@ isc_nm_listentcp(isc_nm_t *mgr, isc_nmiface_t *iface,
isc_quota_t *quota,
isc_nmsocket_t **sockp)
{
isc__netievent_tcplisten_t *ievent = NULL;
isc_nmsocket_t *nsock = NULL;
REQUIRE(VALID_NM(mgr));
@@ -142,6 +156,11 @@ isc_nm_listentcp(isc_nm_t *mgr, isc_nmiface_t *iface,
nsock = isc_mem_get(mgr->mctx, sizeof(*nsock));
isc__nmsocket_init(nsock, mgr, isc_nm_tcplistener);
nsock->iface = iface;
nsock->nchildren = mgr->nworkers;
atomic_init(&nsock->rchildren, mgr->nworkers);
nsock->children = isc_mem_get(mgr->mctx,
mgr->nworkers * sizeof(*nsock));
memset(nsock->children, 0, mgr->nworkers * sizeof(*nsock));
nsock->rcb.accept = cb;
nsock->rcbarg = cbarg;
nsock->extrahandlesize = extrahandlesize;
@@ -156,18 +175,27 @@ isc_nm_listentcp(isc_nm_t *mgr, isc_nmiface_t *iface,
nsock->tid = isc_random_uniform(mgr->nworkers);
/*
* Listening to TCP is rare enough not to care about the
* added overhead from passing this to another thread.
*/
ievent = isc__nm_get_ievent(mgr, netievent_tcplisten);
* Listening to TCP is rare enough not to care about the
* added overhead from passing this to another thread.
*/
isc__netievent_tcplisten_t *ievent = isc__nm_get_ievent(mgr, netievent_tcplisten);
ievent->sock = nsock;
isc__nm_enqueue_ievent(&mgr->workers[nsock->tid],
(isc__netievent_t *) ievent);
*sockp = nsock;
return (ISC_R_SUCCESS);
}
/*
* For TCP listening we create a single socket, bind it, and then pass it
* to `ncpu` child sockets - the passing is done over IPC.
* XXXWPK This design pattern is ugly but it's "the way to do it" recommended
* by libuv documentation - which also mentions that there should be
* uv_export/uv_import functions which would simplify this greatly.
*/
void
isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ievent0) {
isc__netievent_tcplisten_t *ievent =
@@ -184,10 +212,56 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ievent0) {
}
uv_tcp_bind(&sock->uv_handle.tcp, &sock->iface->addr.type.sa, 0);
r = uv_listen((uv_stream_t *) &sock->uv_handle.tcp, sock->backlog,
tcp_connection_cb);
if (r != 0) {
return;
sock->uv_handle.tcp.data = sock;
/*
* This is not properly documented in libuv, and the example
* (benchmark-multi-accept) is wrong:
* 'ipc' parameter must be '0' for 'listening' IPC socket, '1'
* only for the sockets are really passing the FDs between
* threads. This works without any problems on Unices, but
* breaks horribly on Windows.
*/
r = uv_pipe_init(&worker->loop, &sock->ipc, 0);
INSIST(r == 0);
sock->ipc.data = sock;
r = uv_pipe_bind(&sock->ipc, sock->ipc_pipe_name);
INSIST(r == 0);
r = uv_listen((uv_stream_t *) &sock->ipc, sock->nchildren,
ipc_connection_cb);
INSIST(r == 0);
/*
* We launch n 'tcpchildlistener' that will receive
* sockets to be listened on over ipc.
*/
for (int i = 0; i < sock->nchildren; i++) {
isc__netievent_tcpchildlisten_t *event = NULL;
isc_nmsocket_t *csock = &sock->children[i];
isc__nmsocket_init(csock, sock->mgr, isc_nm_tcpchildlistener);
csock->parent = sock;
csock->iface = sock->iface;
csock->tid = i;
csock->pquota = sock->pquota;
csock->backlog = sock->backlog;
csock->extrahandlesize = sock->extrahandlesize;
INSIST(csock->rcb.recv == NULL && csock->rcbarg == NULL);
csock->rcb.accept = sock->rcb.accept;
csock->rcbarg = sock->rcbarg;
csock->fd = -1;
event = isc__nm_get_ievent(csock->mgr,
netievent_tcpchildlisten);
event->sock = csock;
if (csock->tid == isc_nm_tid()) {
isc__nm_async_tcpchildlisten(&sock->mgr->workers[i],
(isc__netievent_t *) event);
isc__nm_put_ievent(sock->mgr, event);
} else {
isc__nm_enqueue_ievent(&sock->mgr->workers[i],
(isc__netievent_t *) event);
}
}
atomic_store(&sock->listening, true);
@@ -195,6 +269,117 @@ isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ievent0) {
return;
}
/* Parent got an IPC connection from child */
static void
ipc_connection_cb(uv_stream_t *stream, int status) {
int r;
REQUIRE(status == 0);
isc_nmsocket_t *sock = stream->data;
isc__networker_t *worker = &sock->mgr->workers[isc_nm_tid()];
isc__nm_uvreq_t *nreq = isc__nm_uvreq_get(sock->mgr, sock);
/*
* The buffer can be anything, it will be ignored, but it has to
* be something that won't disappear.
*/
nreq->uvbuf = uv_buf_init((char *)nreq, 1);
uv_pipe_init(&worker->loop, &nreq->pipe, 1);
nreq->pipe.data = nreq;
/* Failure here is critical */
r = uv_accept((uv_stream_t *) &sock->ipc,
(uv_stream_t*) &nreq->pipe);
INSIST(r == 0);
r = uv_write2(&nreq->uv_req.write,
(uv_stream_t*) &nreq->pipe,
&nreq->uvbuf,
1,
(uv_stream_t*) &sock->uv_handle.stream,
ipc_write_cb);
INSIST(r == 0);
}
static void
ipc_write_cb(uv_write_t* uvreq, int status) {
UNUSED(status);
isc__nm_uvreq_t *req = uvreq->data;
/*
* We want all children to get the socket. If we're done we can stop
* listening on the IPC socket.
*/
if (atomic_fetch_add(&req->sock->schildren, 1) ==
req->sock->nchildren - 1) {
uv_close((uv_handle_t*) &req->sock->ipc, NULL);
}
uv_close((uv_handle_t*) &req->pipe, parent_pipe_close_cb);
}
static void
parent_pipe_close_cb(uv_handle_t *handle) {
isc__nm_uvreq_t *req = handle->data;
isc__nm_uvreq_put(&req, req->sock);
}
void
isc__nm_async_tcpchildlisten(isc__networker_t *worker, isc__netievent_t *ievent0) {
isc__netievent_tcplisten_t *ievent =
(isc__netievent_tcplisten_t *) ievent0;
isc_nmsocket_t *sock = ievent->sock;
int r;
REQUIRE(isc__nm_in_netthread());
REQUIRE(sock->type == isc_nm_tcpchildlistener);
r = uv_pipe_init(&worker->loop, &sock->ipc, 1);
INSIST(r == 0);
sock->ipc.data = sock;
isc__nm_uvreq_t * req = isc__nm_uvreq_get(sock->mgr, sock);
uv_pipe_connect(&req->uv_req.connect,
&sock->ipc,
sock->parent->ipc_pipe_name,
childlisten_ipc_connect_cb);
}
/* child connected to parent over IPC */
static void
childlisten_ipc_connect_cb(uv_connect_t *uvreq, int status) {
UNUSED(status);
isc__nm_uvreq_t *req = uvreq->data;
isc_nmsocket_t *sock = req->sock;
isc__nm_uvreq_put(&req, sock);
int r = uv_read_start((uv_stream_t*) &sock->ipc,
isc__nm_alloc_cb,
childlisten_read_cb);
INSIST(r == 0);
}
/* child got the socket over IPC */
static void
childlisten_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) {
UNUSED(nread);
int r;
isc_nmsocket_t *sock = stream->data;
REQUIRE(VALID_NMSOCK(sock));
REQUIRE(buf != NULL);
uv_pipe_t* ipc = (uv_pipe_t*) stream;
uv_handle_type type = uv_pipe_pending_type(ipc);
INSIST(type == UV_TCP);
isc__nm_free_uvbuf(sock, buf);
isc__networker_t * worker = &sock->mgr->workers[isc_nm_tid()];
uv_tcp_init(&worker->loop, (uv_tcp_t*) &sock->uv_handle.tcp);
sock->uv_handle.tcp.data = sock;
uv_accept(stream, &sock->uv_handle.stream);
r = uv_listen((uv_stream_t *) &sock->uv_handle.tcp, sock->backlog,
tcp_connection_cb);
uv_close((uv_handle_t*) ipc, NULL);
if (r != 0) {
/* XXX log it? */
return;
}
}
void
isc_nm_tcp_stoplistening(isc_nmsocket_t *sock) {
isc__netievent_tcpstoplisten_t *ievent = NULL;
@@ -208,21 +393,6 @@ isc_nm_tcp_stoplistening(isc_nmsocket_t *sock) {
(isc__netievent_t *) ievent);
}
static void
stoplistening_cb(uv_handle_t *handle) {
isc_nmsocket_t *sock = handle->data;
LOCK(&sock->lock);
atomic_store(&sock->listening, false);
atomic_store(&sock->closed, true);
SIGNAL(&sock->cond);
UNLOCK(&sock->lock);
sock->pquota = NULL;
isc_nmsocket_detach(&sock);
}
void
isc__nm_async_tcpstoplisten(isc__networker_t *worker,
isc__netievent_t *ievent0)
@@ -237,7 +407,97 @@ isc__nm_async_tcpstoplisten(isc__networker_t *worker,
REQUIRE(VALID_NMSOCK(sock));
REQUIRE(sock->type == isc_nm_tcplistener);
uv_close(&sock->uv_handle.handle, stoplistening_cb);
/*
* If network manager is interlocked, re-enqueue the event for later.
*/
if (!isc__nm_acquire_interlocked(sock->mgr)) {
isc__netievent_tcpstoplisten_t *event = NULL;
event = isc__nm_get_ievent(sock->mgr,
netievent_tcpstoplisten);
event->sock = sock;
isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
(isc__netievent_t *) event);
} else {
stoplistening(sock);
isc__nm_drop_interlocked(sock->mgr);
}
}
static void
stoplistening(isc_nmsocket_t *sock) {
for (int i = 0; i < sock->nchildren; i++) {
/*
* Stoplistening is a rare event, we can ignore the overhead
* caused by allocating an event, and doing it this way
* simplifies sock reference counting.
*/
isc__netievent_tcpstopchildlisten_t *event = NULL;
event = isc__nm_get_ievent(sock->mgr,
netievent_tcpstopchildlisten);
isc_nmsocket_attach(&sock->children[i], &event->sock);
if (i == sock->tid) {
isc__nm_async_tcpstopchildlisten(&sock->mgr->workers[i],
(isc__netievent_t *) event);
isc__nm_put_ievent(sock->mgr, event);
} else {
isc__nm_enqueue_ievent(&sock->mgr->workers[i],
(isc__netievent_t *) event);
}
}
LOCK(&sock->lock);
while (atomic_load_relaxed(&sock->rchildren) > 0) {
WAIT(&sock->cond, &sock->lock);
}
UNLOCK(&sock->lock);
uv_close((uv_handle_t *) &sock->uv_handle.tcp, tcp_listenclose_cb);
}
void
isc__nm_async_tcpstopchildlisten(isc__networker_t *worker,
isc__netievent_t *ievent0)
{
isc__netievent_tcpstoplisten_t *ievent =
(isc__netievent_tcpstoplisten_t *) ievent0;
isc_nmsocket_t *sock = ievent->sock;
UNUSED(worker);
REQUIRE(isc_nm_tid() == sock->tid);
REQUIRE(VALID_NMSOCK(sock));
REQUIRE(sock->type == isc_nm_tcpchildlistener);
REQUIRE(sock->parent != NULL);
/*
* rchildren is atomic but we still need to change it
* under a lock as the parent is waiting on conditional
* and without it we might deadlock.
*/
LOCK(&sock->parent->lock);
atomic_fetch_sub(&sock->parent->rchildren, 1);
UNLOCK(&sock->parent->lock);
uv_close((uv_handle_t *) &sock->uv_handle.tcp, tcp_listenclose_cb);
BROADCAST(&sock->parent->cond);
}
/*
* This callback is used for closing child and parent listening sockets -
* that's why we need to choose the proper lock.
*/
static void
tcp_listenclose_cb(uv_handle_t *handle) {
isc_nmsocket_t *sock = handle->data;
isc_mutex_t * lock = (sock->parent != NULL) ?
&sock->parent->lock : &sock->lock;
LOCK(lock);
atomic_store(&sock->closed, true);
atomic_store(&sock->listening, false);
sock->pquota = NULL;
UNLOCK(lock);
isc_nmsocket_detach(&sock);
}
static void