From 4f78f9d72a5dc3f6a596251e662d00cb80cd5e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Tue, 14 Dec 2021 21:49:53 +0100 Subject: [PATCH 1/5] Add #define ISC_OS_CACHELINE_SIZE 64 Add library ctor and dtor for isc_os compilation unit which initializes the numbers of the CPUs and also checks whether L1 cacheline size is really 64 if the sysconf() call is available. --- lib/isc/Makefile.am | 1 + lib/isc/include/isc/os.h | 8 ++++++++ lib/isc/lib.c | 3 +++ lib/isc/os.c | 39 ++++++++++++++++++++++++++------------- lib/isc/os_p.h | 24 ++++++++++++++++++++++++ util/copyrights | 1 + 6 files changed, 63 insertions(+), 13 deletions(-) create mode 100644 lib/isc/os_p.h diff --git a/lib/isc/Makefile.am b/lib/isc/Makefile.am index 70358f8a82..c6f800d5c6 100644 --- a/lib/isc/Makefile.am +++ b/lib/isc/Makefile.am @@ -168,6 +168,7 @@ libisc_la_SOURCES = \ openssl_shim.c \ openssl_shim.h \ os.c \ + os_p.h \ parseint.c \ pool.c \ portset.c \ diff --git a/lib/isc/include/isc/os.h b/lib/isc/include/isc/os.h index f9e90e0d82..96c9f0f3ea 100644 --- a/lib/isc/include/isc/os.h +++ b/lib/isc/include/isc/os.h @@ -14,9 +14,17 @@ /*! \file isc/os.h */ #include +#include ISC_LANG_BEGINDECLS +/*%< + * Hardcode the L1 cacheline size of the CPU to 64, this is checked in + * the os.c library constructor if operating system provide means to + * get the L1 cacheline size using sysconf(). + */ +#define ISC_OS_CACHELINE_SIZE 64 + unsigned int isc_os_ncpus(void); /*%< diff --git a/lib/isc/lib.c b/lib/isc/lib.c index 5f9173ff4c..e591b08780 100644 --- a/lib/isc/lib.c +++ b/lib/isc/lib.c @@ -19,6 +19,7 @@ #include "config.h" #include "mem_p.h" +#include "os_p.h" #include "tls_p.h" #include "trampoline_p.h" @@ -37,6 +38,7 @@ isc__shutdown(void) ISC_DESTRUCTOR; void isc__initialize(void) { + isc__os_initialize(); isc__mem_initialize(); isc__tls_initialize(); isc__trampoline_initialize(); @@ -48,4 +50,5 @@ isc__shutdown(void) { isc__trampoline_shutdown(); isc__tls_shutdown(); isc__mem_shutdown(); + isc__os_shutdown(); } diff --git a/lib/isc/os.c b/lib/isc/os.c index 6a75b60976..660d94af57 100644 --- a/lib/isc/os.c +++ b/lib/isc/os.c @@ -9,12 +9,15 @@ * information regarding copyright ownership. */ -#include +#include + #include +#include #include -static isc_once_t ncpus_once = ISC_ONCE_INIT; -static unsigned int ncpus = 0; +#include "os_p.h" + +static unsigned int isc__os_ncpus = 0; #ifdef HAVE_SYSCONF @@ -54,23 +57,33 @@ sysctl_ncpus(void) { static void ncpus_initialize(void) { #if defined(HAVE_SYSCONF) - ncpus = sysconf_ncpus(); + isc__os_ncpus = sysconf_ncpus(); #endif /* if defined(HAVE_SYSCONF) */ #if defined(HAVE_SYS_SYSCTL_H) && defined(HAVE_SYSCTLBYNAME) - if (ncpus <= 0) { - ncpus = sysctl_ncpus(); + if (isc__os_ncpus <= 0) { + isc__os_ncpus = sysctl_ncpus(); } #endif /* if defined(HAVE_SYS_SYSCTL_H) && defined(HAVE_SYSCTLBYNAME) */ - if (ncpus <= 0) { - ncpus = 1; + if (isc__os_ncpus == 0) { + isc__os_ncpus = 1; } } unsigned int isc_os_ncpus(void) { - isc_result_t result = isc_once_do(&ncpus_once, ncpus_initialize); - RUNTIME_CHECK(result == ISC_R_SUCCESS); - INSIST(ncpus > 0); - - return (ncpus); + return (isc__os_ncpus); +} + +void +isc__os_initialize(void) { + ncpus_initialize(); +#if defined(HAVE_SYSCONF) && defined(_SC_LEVEL1_DCACHE_LINESIZE) + long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + RUNTIME_CHECK((size_t)s == (size_t)ISC_OS_CACHELINE_SIZE); +#endif +} + +void +isc__os_shutdown(void) { + /* empty, but defined for completeness */; } diff --git a/lib/isc/os_p.h b/lib/isc/os_p.h new file mode 100644 index 0000000000..0c28a06975 --- /dev/null +++ b/lib/isc/os_p.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#pragma once + +#include + +#include + +/*! \file */ + +void +isc__os_initialize(void); + +void +isc__os_shutdown(void); diff --git a/util/copyrights b/util/copyrights index 150f956e9d..e57b83189e 100644 --- a/util/copyrights +++ b/util/copyrights @@ -1790,6 +1790,7 @@ ./lib/isc/openssl_shim.c C 2018,2019,2020,2021,2022 ./lib/isc/openssl_shim.h C 2020,2021,2022 ./lib/isc/os.c C 2000,2001,2004,2005,2007,2016,2018,2019,2020,2021,2022 +./lib/isc/os_p.h C 2022 ./lib/isc/parseint.c C 2001,2002,2003,2004,2005,2007,2012,2016,2018,2019,2020,2021,2022 ./lib/isc/pool.c C 2013,2015,2016,2018,2019,2020,2021,2022 ./lib/isc/portset.c C 2008,2016,2017,2018,2019,2020,2021,2022 From c917a2ca88b53617992840315cc92494b6a84147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Tue, 14 Dec 2021 21:50:23 +0100 Subject: [PATCH 2/5] Add isc_mem_*_aligned() function that works with aligned memory There are some situations where having aligned allocations would be useful, so we don't have to play tricks with padding the data to the cacheline sizes. Add isc_mem_{get,put,reget,putanddetach}_aligned() functions that has alignment and size as last argument mimicking the POSIX posix_memalign() functions on systems with jemalloc (see the documentation on MALLOX_ALIGN() for more details). On systems without jemalloc, those functions are same as non-aligned variants. --- lib/isc/include/isc/mem.h | 46 ++++++++++++++++++++++---------- lib/isc/jemalloc_shim.h | 3 +++ lib/isc/mem.c | 56 ++++++++++++++++++++++----------------- 3 files changed, 66 insertions(+), 39 deletions(-) diff --git a/lib/isc/include/isc/mem.h b/lib/isc/include/isc/mem.h index b90c3d6017..3e3b72324f 100644 --- a/lib/isc/include/isc/mem.h +++ b/lib/isc/include/isc/mem.h @@ -133,9 +133,13 @@ extern unsigned int isc_mem_defaultflags; #define ISCMEMFUNC(sfx) isc__mem_##sfx #define ISCMEMPOOLFUNC(sfx) isc__mempool_##sfx -#define isc_mem_get(c, s) ISCMEMFUNC(get)((c), (s)_ISC_MEM_FILELINE) +#define isc_mem_get(c, s) ISCMEMFUNC(get)((c), (s), 0 _ISC_MEM_FILELINE) +#define isc_mem_get_aligned(c, s, a) \ + ISCMEMFUNC(get)((c), (s), (a)_ISC_MEM_FILELINE) #define isc_mem_reget(c, p, o, n) \ - ISCMEMFUNC(reget)((c), (p), (o), (n)_ISC_MEM_FILELINE) + ISCMEMFUNC(reget)((c), (p), (o), (n), 0 _ISC_MEM_FILELINE) +#define isc_mem_reget_aligned(c, p, o, n, a) \ + ISCMEMFUNC(reget)((c), (p), (o), (n), (a)_ISC_MEM_FILELINE) #define isc_mem_allocate(c, s) ISCMEMFUNC(allocate)((c), (s)_ISC_MEM_FILELINE) #define isc_mem_reallocate(c, p, s) \ ISCMEMFUNC(reallocate)((c), (p), (s)_ISC_MEM_FILELINE) @@ -144,15 +148,27 @@ extern unsigned int isc_mem_defaultflags; ISCMEMFUNC(strndup)((c), (p), (l)_ISC_MEM_FILELINE) #define isc_mempool_get(c) ISCMEMPOOLFUNC(get)((c)_ISC_MEM_FILELINE) -#define isc_mem_put(c, p, s) \ - do { \ - ISCMEMFUNC(put)((c), (p), (s)_ISC_MEM_FILELINE); \ - (p) = NULL; \ +#define isc_mem_put(c, p, s) \ + do { \ + ISCMEMFUNC(put)((c), (p), (s), 0 _ISC_MEM_FILELINE); \ + (p) = NULL; \ } while (0) -#define isc_mem_putanddetach(c, p, s) \ - do { \ - ISCMEMFUNC(putanddetach)((c), (p), (s)_ISC_MEM_FILELINE); \ - (p) = NULL; \ +#define isc_mem_put_aligned(c, p, s, a) \ + do { \ + ISCMEMFUNC(put) \ + ((c), (p), (s), (a)_ISC_MEM_FILELINE); \ + (p) = NULL; \ + } while (0) +#define isc_mem_putanddetach(c, p, s) \ + do { \ + ISCMEMFUNC(putanddetach)((c), (p), (s), 0 _ISC_MEM_FILELINE); \ + (p) = NULL; \ + } while (0) +#define isc_mem_putanddetach_aligned(c, p, s, a) \ + do { \ + ISCMEMFUNC(putanddetach) \ + ((c), (p), (s), (a)_ISC_MEM_FILELINE); \ + (p) = NULL; \ } while (0) #define isc_mem_free(c, p) \ do { \ @@ -477,15 +493,17 @@ isc_mempool_setfillcount(isc_mempool_t *restrict mpctx, /* * Pseudo-private functions for use via macros. Do not call directly. */ -void ISCMEMFUNC(putanddetach)(isc_mem_t **, void *, size_t _ISC_MEM_FLARG); -void ISCMEMFUNC(put)(isc_mem_t *, void *, size_t _ISC_MEM_FLARG); +void ISCMEMFUNC(putanddetach)(isc_mem_t **, void *, size_t, + size_t _ISC_MEM_FLARG); +void ISCMEMFUNC(put)(isc_mem_t *, void *, size_t, size_t _ISC_MEM_FLARG); void ISCMEMFUNC(free)(isc_mem_t *, void *_ISC_MEM_FLARG); ISC_ATTR_MALLOC_DEALLOCATOR_IDX(ISCMEMFUNC(put), 2) -void *ISCMEMFUNC(get)(isc_mem_t *, size_t _ISC_MEM_FLARG); +void *ISCMEMFUNC(get)(isc_mem_t *, size_t, size_t _ISC_MEM_FLARG); ISC_ATTR_DEALLOCATOR_IDX(ISCMEMFUNC(put), 2) -void *ISCMEMFUNC(reget)(isc_mem_t *, void *, size_t, size_t _ISC_MEM_FLARG); +void *ISCMEMFUNC(reget)(isc_mem_t *, void *, size_t, size_t, + size_t _ISC_MEM_FLARG); ISC_ATTR_MALLOC_DEALLOCATOR_IDX(ISCMEMFUNC(free), 2) void *ISCMEMFUNC(allocate)(isc_mem_t *, size_t _ISC_MEM_FLARG); diff --git a/lib/isc/jemalloc_shim.h b/lib/isc/jemalloc_shim.h index b207eabc89..20801def01 100644 --- a/lib/isc/jemalloc_shim.h +++ b/lib/isc/jemalloc_shim.h @@ -19,6 +19,9 @@ const char *malloc_conf = NULL; +/* Without jemalloc, isc_mem_get_align() is equal to isc_mem_get() */ +#define MALLOCX_ALIGN(a) (a & 0) /* Clear the flag */ + #if defined(HAVE_MALLOC_SIZE) || defined(HAVE_MALLOC_USABLE_SIZE) #include diff --git a/lib/isc/mem.c b/lib/isc/mem.c index 0fa088a0e1..7bbbad3254 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -328,16 +329,18 @@ unlock: s = ZERO_ALLOCATION_SIZE; \ } +#define MEM_ALIGN(a) ((a) ? MALLOCX_ALIGN(a) : 0) + /*! * Perform a malloc, doing memory filling and overrun detection as necessary. */ static inline void * -mem_get(isc_mem_t *ctx, size_t size) { +mem_get(isc_mem_t *ctx, size_t size, int flags) { char *ret = NULL; ADJUST_ZERO_ALLOCATION_SIZE(size); - ret = mallocx(size, 0); + ret = mallocx(size, flags); INSIST(ret != NULL); if ((ctx->flags & ISC_MEMFLAG_FILL) != 0) { @@ -352,22 +355,23 @@ mem_get(isc_mem_t *ctx, size_t size) { */ /* coverity[+free : arg-1] */ static inline void -mem_put(isc_mem_t *ctx, void *mem, size_t size) { +mem_put(isc_mem_t *ctx, void *mem, size_t size, int flags) { ADJUST_ZERO_ALLOCATION_SIZE(size); if ((ctx->flags & ISC_MEMFLAG_FILL) != 0) { memset(mem, 0xde, size); /* Mnemonic for "dead". */ } - sdallocx(mem, size, 0); + sdallocx(mem, size, flags); } static inline void * -mem_realloc(isc_mem_t *ctx, void *old_ptr, size_t old_size, size_t new_size) { +mem_realloc(isc_mem_t *ctx, void *old_ptr, size_t old_size, size_t new_size, + int flags) { void *new_ptr = NULL; ADJUST_ZERO_ALLOCATION_SIZE(new_size); - new_ptr = rallocx(old_ptr, new_size, 0); + new_ptr = rallocx(old_ptr, new_size, flags); INSIST(new_ptr != NULL); if ((ctx->flags & ISC_MEMFLAG_FILL) != 0) { @@ -453,7 +457,7 @@ mem_create(isc_mem_t **ctxp, unsigned int flags) { REQUIRE(ctxp != NULL && *ctxp == NULL); - ctx = mallocx(sizeof(*ctx), 0); + ctx = mallocx(sizeof(*ctx), MALLOCX_ALIGN(ISC_OS_CACHELINE_SIZE)); INSIST(ctx != NULL); *ctx = (isc_mem_t){ @@ -572,7 +576,7 @@ destroy(isc_mem_t *ctx) { if (ctx->checkfree) { INSIST(malloced == 0); } - sdallocx(ctx, sizeof(*ctx), 0); + sdallocx(ctx, sizeof(*ctx), MALLOCX_ALIGN(ISC_OS_CACHELINE_SIZE)); } void @@ -617,7 +621,8 @@ isc__mem_detach(isc_mem_t **ctxp FLARG) { */ void -isc__mem_putanddetach(isc_mem_t **ctxp, void *ptr, size_t size FLARG) { +isc__mem_putanddetach(isc_mem_t **ctxp, void *ptr, size_t size, + size_t alignment FLARG) { isc_mem_t *ctx = NULL; REQUIRE(ctxp != NULL && VALID_CONTEXT(*ctxp)); @@ -630,7 +635,7 @@ isc__mem_putanddetach(isc_mem_t **ctxp, void *ptr, size_t size FLARG) { DELETE_TRACE(ctx, ptr, size, file, line); mem_putstats(ctx, ptr, size); - mem_put(ctx, ptr, size); + mem_put(ctx, ptr, size, MEM_ALIGN(alignment)); if (isc_refcount_decrement(&ctx->references) == 1) { isc_refcount_destroy(&ctx->references); @@ -745,12 +750,12 @@ lo_water(isc_mem_t *ctx) { } void * -isc__mem_get(isc_mem_t *ctx, size_t size FLARG) { +isc__mem_get(isc_mem_t *ctx, size_t size, size_t alignment FLARG) { void *ptr = NULL; REQUIRE(VALID_CONTEXT(ctx)); - ptr = mem_get(ctx, size); + ptr = mem_get(ctx, size, MEM_ALIGN(alignment)); mem_getstats(ctx, size); ADD_TRACE(ctx, ptr, size, file, line); @@ -761,13 +766,13 @@ isc__mem_get(isc_mem_t *ctx, size_t size FLARG) { } void -isc__mem_put(isc_mem_t *ctx, void *ptr, size_t size FLARG) { +isc__mem_put(isc_mem_t *ctx, void *ptr, size_t size, size_t alignment FLARG) { REQUIRE(VALID_CONTEXT(ctx)); DELETE_TRACE(ctx, ptr, size, file, line); mem_putstats(ctx, ptr, size); - mem_put(ctx, ptr, size); + mem_put(ctx, ptr, size, MEM_ALIGN(alignment)); CALL_LO_WATER(ctx); } @@ -884,7 +889,7 @@ isc__mem_allocate(isc_mem_t *ctx, size_t size FLARG) { REQUIRE(VALID_CONTEXT(ctx)); - ptr = mem_get(ctx, size); + ptr = mem_get(ctx, size, 0); /* Recalculate the real allocated size */ size = sallocx(ptr, 0); @@ -898,20 +903,21 @@ isc__mem_allocate(isc_mem_t *ctx, size_t size FLARG) { } void * -isc__mem_reget(isc_mem_t *ctx, void *old_ptr, size_t old_size, - size_t new_size FLARG) { +isc__mem_reget(isc_mem_t *ctx, void *old_ptr, size_t old_size, size_t new_size, + size_t alignment FLARG) { void *new_ptr = NULL; if (old_ptr == NULL) { REQUIRE(old_size == 0); - new_ptr = isc__mem_get(ctx, new_size FLARG_PASS); + new_ptr = isc__mem_get(ctx, new_size, alignment FLARG_PASS); } else if (new_size == 0) { - isc__mem_put(ctx, old_ptr, old_size FLARG_PASS); + isc__mem_put(ctx, old_ptr, old_size, alignment FLARG_PASS); } else { DELETE_TRACE(ctx, old_ptr, old_size, file, line); mem_putstats(ctx, old_ptr, old_size); - new_ptr = mem_realloc(ctx, old_ptr, old_size, new_size); + new_ptr = mem_realloc(ctx, old_ptr, old_size, new_size, + MEM_ALIGN(alignment)); mem_getstats(ctx, new_size); ADD_TRACE(ctx, new_ptr, new_size, file, line); @@ -944,7 +950,7 @@ isc__mem_reallocate(isc_mem_t *ctx, void *old_ptr, size_t new_size FLARG) { DELETE_TRACE(ctx, old_ptr, old_size, file, line); mem_putstats(ctx, old_ptr, old_size); - new_ptr = mem_realloc(ctx, old_ptr, old_size, new_size); + new_ptr = mem_realloc(ctx, old_ptr, old_size, new_size, 0); /* Recalculate the real allocated size */ new_size = sallocx(new_ptr, 0); @@ -975,7 +981,7 @@ isc__mem_free(isc_mem_t *ctx, void *ptr FLARG) { DELETE_TRACE(ctx, ptr, size, file, line); mem_putstats(ctx, ptr, size); - mem_put(ctx, ptr, size); + mem_put(ctx, ptr, size, 0); CALL_LO_WATER(ctx); } @@ -1243,7 +1249,7 @@ isc__mempool_destroy(isc_mempool_t **restrict mpctxp FLARG) { mpctx->items = item->next; mem_putstats(mctx, item, mpctx->size); - mem_put(mctx, item, mpctx->size); + mem_put(mctx, item, mpctx->size, 0); } /* @@ -1278,7 +1284,7 @@ isc__mempool_get(isc_mempool_t *restrict mpctx FLARG) { * We need to dip into the well. Fill up our free list. */ for (size_t i = 0; i < fillcount; i++) { - item = mem_get(mctx, mpctx->size); + item = mem_get(mctx, mpctx->size, 0); mem_getstats(mctx, mpctx->size); item->next = mpctx->items; mpctx->items = item; @@ -1326,7 +1332,7 @@ isc__mempool_put(isc_mempool_t *restrict mpctx, void *mem FLARG) { */ if (freecount >= freemax) { mem_putstats(mctx, mem, mpctx->size); - mem_put(mctx, mem, mpctx->size); + mem_put(mctx, mem, mpctx->size, 0); return; } From c84eb550491dcb6b72c581c72d7bb3174ac40658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Thu, 9 Dec 2021 11:40:02 +0100 Subject: [PATCH 3/5] Reduce the memory used by hazard pointers The hazard pointers implementation was bit of frivolous with memory usage allocating memory based on maximum constants rather than on the usage. Make the retired list bit use exactly the memory needed for specified number of hazard pointers. This reduced the memory used by hazard pointers to one quarter in our specific case because we only use single HP in the queue implementation (as opposed to allocating memory for HP_MAX_HPS = 4). Previously, the alignment to prevent false sharing was double the cacheline size. This was copied from the ConcurrencyFreaks implementation, but one cacheline size is enough to prevent false sharing, so we are using this now to save few bits of memory. The top level hazard pointers and retired list arrays are now not aligned to the cacheline size - they are read-only for the whole life-time of the isc_hp object. Only hp (hazard pointer) and rl (retired list) array members are allocated aligned to the cacheline size to avoid false sharing between threads. Cleanup HP_MAX_HPS and HP_THRESHOLD_R constants from the paper, because we don't use them in the code. HP_THRESHOLD_R was 0, so the check whether the retired list size was smaller than the value was basically a dead code. --- lib/isc/hp.c | 98 +++++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/lib/isc/hp.c b/lib/isc/hp.c index aaa59f6c45..5cd64d4f75 100644 --- a/lib/isc/hp.c +++ b/lib/isc/hp.c @@ -45,34 +45,32 @@ #include +#include #include #include #include #include +#include #include #include #include -#define HP_MAX_THREADS 128 static int isc__hp_max_threads = 1; -#define HP_MAX_HPS 4 /* This is named 'K' in the HP paper */ -#define CLPAD (128 / sizeof(uintptr_t)) -#define HP_THRESHOLD_R 0 /* This is named 'R' in the HP paper */ - -/* Maximum number of retired objects per thread */ -static int isc__hp_max_retired = HP_MAX_THREADS * HP_MAX_HPS; typedef struct retirelist { int size; uintptr_t *list; } retirelist_t; +typedef atomic_uintptr_t isc_hp_uintptr_t; + struct isc_hp { int max_hps; + int max_retired; isc_mem_t *mctx; - atomic_uintptr_t **hp; - retirelist_t **rl; isc_hp_deletefunc_t *deletefunc; + isc_hp_uintptr_t **hp; + retirelist_t **rl; }; static inline int @@ -89,7 +87,6 @@ isc_hp_init(int max_threads) { } isc__hp_max_threads = max_threads; - isc__hp_max_retired = max_threads * HP_MAX_HPS; } isc_hp_t * @@ -97,29 +94,42 @@ isc_hp_new(isc_mem_t *mctx, size_t max_hps, isc_hp_deletefunc_t *deletefunc) { isc_hp_t *hp = isc_mem_get(mctx, sizeof(*hp)); REQUIRE(isc__hp_max_threads > 0); - REQUIRE(max_hps <= HP_MAX_HPS); + REQUIRE(max_hps > 0); - if (max_hps == 0) { - max_hps = HP_MAX_HPS; - } - - *hp = (isc_hp_t){ .max_hps = max_hps, .deletefunc = deletefunc }; + *hp = (isc_hp_t){ + .max_hps = max_hps, + .max_retired = isc__hp_max_threads * max_hps, + .deletefunc = deletefunc, + }; isc_mem_attach(mctx, &hp->mctx); hp->hp = isc_mem_get(mctx, isc__hp_max_threads * sizeof(hp->hp[0])); + for (int i = 0; i < isc__hp_max_threads; i++) { + isc_hp_uintptr_t *hps; + + hps = isc_mem_get_aligned(mctx, hp->max_hps * sizeof(*hps), + ISC_OS_CACHELINE_SIZE); + for (int j = 0; j < hp->max_hps; j++) { + atomic_init(&hps[j], 0); + } + + hp->hp[i] = hps; + } + hp->rl = isc_mem_get(mctx, isc__hp_max_threads * sizeof(hp->rl[0])); for (int i = 0; i < isc__hp_max_threads; i++) { - hp->hp[i] = isc_mem_get(mctx, CLPAD * 2 * sizeof(hp->hp[i][0])); - hp->rl[i] = isc_mem_get(mctx, sizeof(*hp->rl[0])); - *hp->rl[i] = (retirelist_t){ .size = 0 }; + retirelist_t *rl; - for (int j = 0; j < hp->max_hps; j++) { - atomic_init(&hp->hp[i][j], 0); - } - hp->rl[i]->list = isc_mem_get( - hp->mctx, isc__hp_max_retired * sizeof(uintptr_t)); + rl = isc_mem_get_aligned(mctx, sizeof(*rl), + ISC_OS_CACHELINE_SIZE); + rl->size = 0; + rl->list = isc_mem_get(hp->mctx, + hp->max_retired * sizeof(uintptr_t)); + memset(rl->list, 0, hp->max_retired * sizeof(uintptr_t)); + + hp->rl[i] = rl; } return (hp); @@ -128,16 +138,21 @@ isc_hp_new(isc_mem_t *mctx, size_t max_hps, isc_hp_deletefunc_t *deletefunc) { void isc_hp_destroy(isc_hp_t *hp) { for (int i = 0; i < isc__hp_max_threads; i++) { - isc_mem_put(hp->mctx, hp->hp[i], - CLPAD * 2 * sizeof(hp->hp[i][0])); + retirelist_t *rl = hp->rl[i]; - for (int j = 0; j < hp->rl[i]->size; j++) { - void *data = (void *)hp->rl[i]->list[j]; + for (int j = 0; j < rl->size; j++) { + void *data = (void *)rl->list[j]; hp->deletefunc(data); } - isc_mem_put(hp->mctx, hp->rl[i]->list, - isc__hp_max_retired * sizeof(uintptr_t)); - isc_mem_put(hp->mctx, hp->rl[i], sizeof(*hp->rl[0])); + isc_mem_put(hp->mctx, rl->list, + hp->max_retired * sizeof(uintptr_t)); + isc_mem_put_aligned(hp->mctx, rl, sizeof(*rl), + ISC_OS_CACHELINE_SIZE); + } + for (int i = 0; i < isc__hp_max_threads; i++) { + isc_hp_uintptr_t *hps = hp->hp[i]; + isc_mem_put_aligned(hp->mctx, hps, hp->max_hps * sizeof(*hps), + ISC_OS_CACHELINE_SIZE); } isc_mem_put(hp->mctx, hp->hp, isc__hp_max_threads * sizeof(hp->hp[0])); isc_mem_put(hp->mctx, hp->rl, isc__hp_max_threads * sizeof(hp->rl[0])); @@ -182,15 +197,12 @@ isc_hp_protect_release(isc_hp_t *hp, int ihp, atomic_uintptr_t ptr) { void isc_hp_retire(isc_hp_t *hp, uintptr_t ptr) { - hp->rl[tid()]->list[hp->rl[tid()]->size++] = ptr; - INSIST(hp->rl[tid()]->size < isc__hp_max_retired); + retirelist_t *rl = hp->rl[tid()]; + rl->list[rl->size++] = ptr; + INSIST(rl->size < hp->max_retired); - if (hp->rl[tid()]->size < HP_THRESHOLD_R) { - return; - } - - for (int iret = 0; iret < hp->rl[tid()]->size; iret++) { - uintptr_t obj = hp->rl[tid()]->list[iret]; + for (int iret = 0; iret < rl->size; iret++) { + uintptr_t obj = rl->list[iret]; bool can_delete = true; for (int itid = 0; itid < isc__hp_max_threads && can_delete; itid++) { @@ -203,11 +215,9 @@ isc_hp_retire(isc_hp_t *hp, uintptr_t ptr) { } if (can_delete) { - size_t bytes = (hp->rl[tid()]->size - iret) * - sizeof(hp->rl[tid()]->list[0]); - memmove(&hp->rl[tid()]->list[iret], - &hp->rl[tid()]->list[iret + 1], bytes); - hp->rl[tid()]->size--; + size_t bytes = (rl->size - iret) * sizeof(rl->list[0]); + memmove(&rl->list[iret], &rl->list[iret + 1], bytes); + rl->size--; hp->deletefunc((void *)obj); } } From 6269fce0fe9d234345374d33d33cad92c3195c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Tue, 14 Dec 2021 22:16:39 +0100 Subject: [PATCH 4/5] Use isc_mem_get_aligned() for isc_queue and cleanup max_threads The isc_queue_new() was using dirty tricks to allocate the head and tail members of the struct aligned to the cacheline. We can now use isc_mem_get_aligned() to allocate the structure to the cacheline directly. Use ISC_OS_CACHELINE_SIZE (64) instead of arbitrary ALIGNMENT (128), one cacheline size is enough to prevent false sharing. Cleanup the unused max_threads variable - there was actually no limit on the maximum number of threads. This was changed a while ago. --- lib/isc/include/isc/queue.h | 2 +- lib/isc/netmgr/netmgr.c | 2 +- lib/isc/queue.c | 41 ++++++++++--------------------------- 3 files changed, 13 insertions(+), 32 deletions(-) diff --git a/lib/isc/include/isc/queue.h b/lib/isc/include/isc/queue.h index a48af88881..1e2821c878 100644 --- a/lib/isc/include/isc/queue.h +++ b/lib/isc/include/isc/queue.h @@ -15,7 +15,7 @@ typedef struct isc_queue isc_queue_t; isc_queue_t * -isc_queue_new(isc_mem_t *mctx, int max_threads); +isc_queue_new(isc_mem_t *mctx); /*%< * Create a new fetch-and-add array queue. * diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index 51045521a1..e5e9a120bc 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -312,7 +312,7 @@ isc__netmgr_create(isc_mem_t *mctx, uint32_t workers, isc_nm_t **netmgrp) { isc_condition_init(&worker->cond_prio); for (size_t type = 0; type < NETIEVENT_MAX; type++) { - worker->ievents[type] = isc_queue_new(mgr->mctx, 128); + worker->ievents[type] = isc_queue_new(mgr->mctx); atomic_init(&worker->nievents[type], 0); } diff --git a/lib/isc/queue.c b/lib/isc/queue.c index d16c8e69ae..590bc92a53 100644 --- a/lib/isc/queue.c +++ b/lib/isc/queue.c @@ -15,15 +15,12 @@ #include #include #include +#include #include #include #define BUFFER_SIZE 1024 -#define MAX_THREADS 128 - -#define ALIGNMENT 128 - static uintptr_t nulluintptr = (uintptr_t)NULL; typedef struct node { @@ -39,13 +36,11 @@ typedef struct node { #define HP_HEAD 0 struct isc_queue { - alignas(ALIGNMENT) atomic_uintptr_t head; - alignas(ALIGNMENT) atomic_uintptr_t tail; + alignas(ISC_OS_CACHELINE_SIZE) atomic_uintptr_t head; + alignas(ISC_OS_CACHELINE_SIZE) atomic_uintptr_t tail; isc_mem_t *mctx; - int max_threads; int taken; isc_hp_t *hp; - void *alloced_ptr; }; static node_t * @@ -93,27 +88,14 @@ queue_cas_head(isc_queue_t *queue, node_t *cmp, const node_t *val) { } isc_queue_t * -isc_queue_new(isc_mem_t *mctx, int max_threads) { +isc_queue_new(isc_mem_t *mctx) { isc_queue_t *queue = NULL; node_t *sentinel = NULL; - void *qbuf = NULL; - uintptr_t qptr; - /* - * A trick to allocate an aligned isc_queue_t structure - */ - qbuf = isc_mem_get(mctx, sizeof(*queue) + ALIGNMENT); - qptr = (uintptr_t)qbuf; - queue = (isc_queue_t *)(qptr + (ALIGNMENT - (qptr % ALIGNMENT))); + queue = isc_mem_get_aligned(mctx, sizeof(*queue), + ISC_OS_CACHELINE_SIZE); - if (max_threads == 0) { - max_threads = MAX_THREADS; - } - - *queue = (isc_queue_t){ - .max_threads = max_threads, - .alloced_ptr = qbuf, - }; + *queue = (isc_queue_t){ 0 }; isc_mem_attach(mctx, &queue->mctx); @@ -137,7 +119,7 @@ isc_queue_enqueue(isc_queue_t *queue, uintptr_t item) { uint_fast32_t idx; uintptr_t n = nulluintptr; - lt = (node_t *)isc_hp_protect(queue->hp, 0, &queue->tail); + lt = (node_t *)isc_hp_protect(queue->hp, HP_TAIL, &queue->tail); idx = atomic_fetch_add(<->enqidx, 1); if (idx > BUFFER_SIZE - 1) { node_t *lnext = NULL; @@ -178,7 +160,7 @@ isc_queue_dequeue(isc_queue_t *queue) { uint_fast32_t idx; uintptr_t item; - lh = (node_t *)isc_hp_protect(queue->hp, 0, &queue->head); + lh = (node_t *)isc_hp_protect(queue->hp, HP_HEAD, &queue->head); if (atomic_load(&lh->deqidx) >= atomic_load(&lh->enqidx) && atomic_load(&lh->next) == nulluintptr) { @@ -215,7 +197,6 @@ isc_queue_dequeue(isc_queue_t *queue) { void isc_queue_destroy(isc_queue_t *queue) { node_t *last = NULL; - void *alloced = NULL; REQUIRE(queue != NULL); @@ -227,6 +208,6 @@ isc_queue_destroy(isc_queue_t *queue) { node_destroy(last); isc_hp_destroy(queue->hp); - alloced = queue->alloced_ptr; - isc_mem_putanddetach(&queue->mctx, alloced, sizeof(*queue) + ALIGNMENT); + isc_mem_putanddetach_aligned(&queue->mctx, queue, sizeof(*queue), + ISC_OS_CACHELINE_SIZE); } From d026ddde822efed49c5fd4e28f67403782da7f08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 15 Dec 2021 11:16:46 +0100 Subject: [PATCH 5/5] Add unit test of aligned isc_mem functions Add unit test that checks whether all the aligned functions work and that allocators return memory aligned at the specified boundary. --- lib/isc/tests/mem_test.c | 47 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/lib/isc/tests/mem_test.c b/lib/isc/tests/mem_test.c index 9e35af3fc7..b61ed7cb01 100644 --- a/lib/isc/tests/mem_test.c +++ b/lib/isc/tests/mem_test.c @@ -12,6 +12,7 @@ #if HAVE_CMOCKA #include +#include #include /* IWYU pragma: keep */ #include #include @@ -148,6 +149,45 @@ isc_mem_test(void **state) { isc_mempool_destroy(&mp1); } +#if defined(HAVE_MALLOC_NP_H) || defined(HAVE_JEMALLOC) +/* aligned memory system tests */ +static void +isc_mem_aligned_test(void **state) { + isc_mem_t *mctx2 = NULL; + void *ptr; + size_t alignment; + uintptr_t aligned; + + UNUSED(state); + + /* Check different alignment sizes up to the page size */ + for (alignment = sizeof(void *); alignment <= 4096; alignment *= 2) { + size_t size = alignment / 2 - 1; + ptr = isc_mem_get_aligned(test_mctx, size, alignment); + + /* Check if the pointer is properly aligned */ + aligned = (((uintptr_t)ptr / alignment) * alignment); + assert_ptr_equal(aligned, (uintptr_t)ptr); + + /* Check if we can resize to range */ + ptr = isc_mem_reget_aligned(test_mctx, ptr, size, + size * 2 + alignment, alignment); + + /* Check if the pointer is still properly aligned */ + aligned = (((uintptr_t)ptr / alignment) * alignment); + assert_ptr_equal(aligned, (uintptr_t)ptr); + + isc_mem_put_aligned(test_mctx, ptr, size * 2 + alignment, + alignment); + + /* Check whether isc_mem_putanddetach_detach() also works */ + isc_mem_create(&mctx2); + ptr = isc_mem_get_aligned(mctx2, size, alignment); + isc_mem_putanddetach_aligned(&mctx2, ptr, size, alignment); + } +} +#endif /* defined(HAVE_MALLOC_NP_H) || defined(HAVE_JEMALLOC) */ + /* test TotalUse calculation */ static void isc_mem_total_test(void **state) { @@ -474,7 +514,8 @@ isc_mem_benchmark(void **state) { t = isc_time_microdiff(&ts2, &ts1); printf("[ TIME ] isc_mem_benchmark: " - "%d isc_mem_{get,put} calls, %f seconds, %f calls/second\n", + "%d isc_mem_{get,put} calls, %f seconds, %f " + "calls/second\n", nthreads * ITERS * NUM_ITEMS, t / 1000000.0, (nthreads * ITERS * NUM_ITEMS) / (t / 1000000.0)); } @@ -490,6 +531,10 @@ main(void) { const struct CMUnitTest tests[] = { cmocka_unit_test_setup_teardown(isc_mem_test, _setup, _teardown), +#if defined(HAVE_MALLOC_NP_H) || defined(HAVE_JEMALLOC) + cmocka_unit_test_setup_teardown(isc_mem_aligned_test, _setup, + _teardown), +#endif /* defined(HAVE_MALLOC_NP_H) || defined(HAVE_JEMALLOC) */ cmocka_unit_test_setup_teardown(isc_mem_total_test, _setup, _teardown), cmocka_unit_test_setup_teardown(isc_mem_inuse_test, _setup,