From c44423127d2573f7907649512d0a2bc6984a4215 Mon Sep 17 00:00:00 2001 From: Evan Hunt Date: Wed, 5 May 2021 14:54:53 -0700 Subject: [PATCH] fix shutdown deadlocks - ensure isc_nm_pause() and isc_nm_resume() work the same whether run from inside or outside of the netmgr. - promote 'stop' events to the priority event level so they can run while the netmgr is pausing or paused. - when pausing, drain the priority queue before acquiring an interlock; this prevents a deadlock when another thread is waiting for us to complete a task. - release interlock after pausing, reacquire it when resuming, so that stop events can happen. some incidental changes: - use a function to enqueue pause and resume events (this was part of a different change attempt that didn't work out; I kept it because I thought it was more readable). - make mgr->nworkers a signed int to remove some annoying integer casts. --- lib/isc/netmgr/netmgr-int.h | 26 +++++----- lib/isc/netmgr/netmgr.c | 97 +++++++++++++++++++++++-------------- 2 files changed, 74 insertions(+), 49 deletions(-) diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index a126a4e403..e532b014cc 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -249,7 +249,6 @@ typedef enum isc__netievent_type { netievent_udpclose, netievent_udpsend, netievent_udpread, - netievent_udpstop, netievent_udpcancel, netievent_tcpconnect, @@ -258,7 +257,6 @@ typedef enum isc__netievent_type { netievent_tcpstartread, netievent_tcppauseread, netievent_tcpaccept, - netievent_tcpstop, netievent_tcpcancel, netievent_tcpdnsaccept, @@ -267,7 +265,6 @@ typedef enum isc__netievent_type { netievent_tcpdnssend, netievent_tcpdnsread, netievent_tcpdnscancel, - netievent_tcpdnsstop, netievent_tlsclose, netievent_tlssend, @@ -282,12 +279,10 @@ typedef enum isc__netievent_type { netievent_tlsdnssend, netievent_tlsdnsread, netievent_tlsdnscancel, - netievent_tlsdnsstop, netievent_tlsdnscycle, netievent_tlsdnsshutdown, 
netievent_httpclose, - netievent_httpstop, netievent_httpsend, netievent_shutdown, @@ -301,19 +296,26 @@ typedef enum isc__netievent_type { netievent_task, netievent_privilegedtask, - netievent_prio = 0xff, /* event type values higher than this - * will be treated as high-priority - * events, which can be processed - * while the netmgr is paused. - */ + /* + * event type values higher than this will be treated + * as high-priority events, which can be processed + * while the netmgr is pausing or paused. + */ + netievent_prio = 0xff, + netievent_udplisten, + netievent_udpstop, netievent_tcplisten, + netievent_tcpstop, netievent_tcpdnslisten, + netievent_tcpdnsstop, netievent_tlsdnslisten, + netievent_tlsdnsstop, + netievent_httpstop, + netievent_resume, netievent_detach, netievent_close, - } isc__netievent_type; typedef union { @@ -658,7 +660,7 @@ struct isc_nm { int magic; isc_refcount_t references; isc_mem_t *mctx; - uint32_t nworkers; + int nworkers; isc_mutex_t lock; isc_condition_t wkstatecond; isc_condition_t wkpausecond; diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index b795dd5aba..05bbae5b61 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -336,7 +336,7 @@ nm_destroy(isc_nm_t **mgr0) { mgr->magic = 0; - for (size_t i = 0; i < mgr->nworkers; i++) { + for (int i = 0; i < mgr->nworkers; i++) { isc__networker_t *worker = &mgr->workers[i]; isc__netievent_t *event = isc__nm_get_netievent_stop(mgr); isc__nm_enqueue_ievent(worker, event); @@ -348,7 +348,7 @@ nm_destroy(isc_nm_t **mgr0) { } UNLOCK(&mgr->lock); - for (size_t i = 0; i < mgr->nworkers; i++) { + for (int i = 0; i < mgr->nworkers; i++) { isc__networker_t *worker = &mgr->workers[i]; isc__netievent_t *ievent = NULL; int r; @@ -414,33 +414,45 @@ nm_destroy(isc_nm_t **mgr0) { #endif /* WIN32 */ } +static void +enqueue_pause(isc__networker_t *worker) { + isc__netievent_pause_t *event = + isc__nm_get_netievent_pause(worker->mgr); + isc__nm_enqueue_ievent(worker, 
(isc__netievent_t *)event); +} + +static void +isc__nm_async_pause(isc__networker_t *worker, isc__netievent_t *ev0) { + UNUSED(ev0); + REQUIRE(worker->paused == false); + + worker->paused = true; + uv_stop(&worker->loop); +} + void isc_nm_pause(isc_nm_t *mgr) { REQUIRE(VALID_NM(mgr)); - uint_fast32_t pausing = 0; REQUIRE(!atomic_load(&mgr->paused)); isc__nm_acquire_interlocked_force(mgr); - for (size_t i = 0; i < mgr->nworkers; i++) { + for (int i = 0; i < mgr->nworkers; i++) { isc__networker_t *worker = &mgr->workers[i]; - if (i != (size_t)isc_nm_tid()) { - isc__netievent_resume_t *event = - isc__nm_get_netievent_pause(mgr); - pausing++; - isc__nm_enqueue_ievent(worker, - (isc__netievent_t *)event); - } else { + if (i == isc_nm_tid()) { isc__nm_async_pause(worker, NULL); + } else { + enqueue_pause(worker); } } if (isc__nm_in_netthread()) { isc_barrier_wait(&mgr->pausing); + drain_priority_queue(&mgr->workers[isc_nm_tid()]); } LOCK(&mgr->lock); - while (atomic_load(&mgr->workers_paused) != pausing) { + while (atomic_load(&mgr->workers_paused) != mgr->workers_running) { WAIT(&mgr->wkstatecond, &mgr->lock); } UNLOCK(&mgr->lock); @@ -449,24 +461,39 @@ isc_nm_pause(isc_nm_t *mgr) { true)); } +static void +enqueue_resume(isc__networker_t *worker) { + isc__netievent_resume_t *event = + isc__nm_get_netievent_resume(worker->mgr); + isc__nm_enqueue_ievent(worker, (isc__netievent_t *)event); +} + +static void +isc__nm_async_resume(isc__networker_t *worker, isc__netievent_t *ev0) { + UNUSED(ev0); + REQUIRE(worker->paused == true); + + worker->paused = false; +} + void isc_nm_resume(isc_nm_t *mgr) { REQUIRE(VALID_NM(mgr)); REQUIRE(atomic_load(&mgr->paused)); - for (size_t i = 0; i < mgr->nworkers; i++) { + for (int i = 0; i < mgr->nworkers; i++) { isc__networker_t *worker = &mgr->workers[i]; - if (i != (size_t)isc_nm_tid()) { - isc__netievent_resume_t *event = - isc__nm_get_netievent_resume(mgr); - isc__nm_enqueue_ievent(worker, - (isc__netievent_t *)event); - } else { + if (i 
== isc_nm_tid()) { isc__nm_async_resume(worker, NULL); + } else { + enqueue_resume(worker); } } if (isc__nm_in_netthread()) { + drain_privilege_queue(&mgr->workers[isc_nm_tid()]); + + atomic_fetch_sub(&mgr->workers_paused, 1); isc_barrier_wait(&mgr->resuming); } @@ -512,7 +539,7 @@ isc__netmgr_shutdown(isc_nm_t *mgr) { REQUIRE(VALID_NM(mgr)); atomic_store(&mgr->closing, true); - for (size_t i = 0; i < mgr->nworkers; i++) { + for (int i = 0; i < mgr->nworkers; i++) { isc__netievent_t *event = NULL; event = isc__nm_get_netievent_shutdown(mgr); isc__nm_enqueue_ievent(&mgr->workers[i], event); @@ -742,23 +769,6 @@ isc__nm_async_stop(isc__networker_t *worker, isc__netievent_t *ev0) { uv_close((uv_handle_t *)&worker->async, NULL); } -static void -isc__nm_async_pause(isc__networker_t *worker, isc__netievent_t *ev0) { - UNUSED(ev0); - REQUIRE(worker->paused == false); - - worker->paused = true; - uv_stop(&worker->loop); -} - -static void -isc__nm_async_resume(isc__networker_t *worker, isc__netievent_t *ev0) { - UNUSED(ev0); - REQUIRE(worker->paused == true); - - worker->paused = false; -} - void isc_nm_task_enqueue(isc_nm_t *nm, isc_task_t *task, int threadid) { isc__netievent_t *event = NULL; @@ -2756,16 +2766,25 @@ isc__nm_async_shutdown(isc__networker_t *worker, isc__netievent_t *ev0) { bool isc__nm_acquire_interlocked(isc_nm_t *mgr) { + if (!isc__nm_in_netthread()) { + return (false); + } + LOCK(&mgr->lock); bool success = atomic_compare_exchange_strong( &mgr->interlocked, &(int){ ISC_NETMGR_NON_INTERLOCKED }, isc_nm_tid()); + UNLOCK(&mgr->lock); return (success); } void isc__nm_drop_interlocked(isc_nm_t *mgr) { + if (!isc__nm_in_netthread()) { + return; + } + LOCK(&mgr->lock); int tid = atomic_exchange(&mgr->interlocked, ISC_NETMGR_NON_INTERLOCKED); @@ -2776,6 +2795,10 @@ isc__nm_drop_interlocked(isc_nm_t *mgr) { void isc__nm_acquire_interlocked_force(isc_nm_t *mgr) { + if (!isc__nm_in_netthread()) { + return; + } + LOCK(&mgr->lock); while 
(!atomic_compare_exchange_strong( &mgr->interlocked, &(int){ ISC_NETMGR_NON_INTERLOCKED },