2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-31 06:15:47 +00:00
Files
ovs/lib/ovs-numa.c
Michael Santana 2803b3fb53 handlers: Fix handlers mapping.
The handler and CPU mapping in upcalls are incorrect, and this is
especially noticeable on systems with cpu isolation enabled.

Say we have a 12 core system where only every even number CPU is enabled
C0, C2, C4, C6, C8, C10

This means we will create an array of size 6 that will be sent to
kernel that is populated with sockets [S0, S1, S2, S3, S4, S5]

The problem is when the kernel does an upcall it checks the socket array
via the index of the CPU, effectively adding additional load on some
CPUs while leaving no work on other CPUs.

e.g.

C0  indexes to S0
C2  indexes to S2 (should be S1)
C4  indexes to S4 (should be S2)

Modulo of 6 (size of socket array) is applied, so we wrap back to S0
C6  indexes to S0 (should be S3)
C8  indexes to S2 (should be S4)
C10 indexes to S4 (should be S5)

Effectively sockets S0, S2, S4 get overloaded while sockets S1, S3, S5
get no work assigned to them

This leads to the kernel to throw the following message:
"openvswitch: cpu_id mismatch with handler threads"

Instead we will send the kernel a corrected array of sockets the size
of all CPUs in the system, or the largest core_id on the system,
whichever one is greatest. This is to take care of systems with
non-contiguous core ids.

In the above example we would create a
corrected array in a round-robin(assuming prime bias) fashion as follows:
[S0, S1, S2, S3, S4, S5, S6, S0, S1, S2, S3, S4]

Fixes: b1e517bd2f ("dpif-netlink: Introduce per-cpu upcall dispatch.")
Co-authored-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Michael Santana <msantana@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-08-15 19:43:57 +02:00

667 lines
17 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2014 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "ovs-numa.h"
#include <ctype.h>
#include <errno.h>
#ifdef __linux__
#include <dirent.h>
#include <stddef.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#endif /* __linux__ */
#include "hash.h"
#include "openvswitch/hmap.h"
#include "openvswitch/list.h"
#include "ovs-thread.h"
#include "openvswitch/vlog.h"
#include "util.h"
VLOG_DEFINE_THIS_MODULE(ovs_numa);
/* ovs-numa module
* ===============
*
* This module stores the affinity information of numa nodes and cpu cores.
* It also provides functions to bookkeep the pin of threads on cpu cores.
*
* It is assumed that the numa node ids and cpu core ids all start from 0.
* There is no guarantee that node and cpu ids are numbered consecutively
* So, for example, if two nodes exist with ids 0 and 8,
* 'ovs_numa_get_n_nodes()' will return 2, no assumption of node numbering
* should be made.
*
* NOTE, this module should only be used by the main thread.
*
* NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be
* invalidated when ever the system topology changes. Support for detecting
* topology changes has not been included. For now, add a TODO entry for
* addressing it in the future.
*
* TODO: Fix ovs-numa when cpu hotplug is used.
*/
/* numa node. */
struct numa_node {
    struct hmap_node hmap_node; /* In 'all_numa_nodes', hashed on 'numa_id'. */
    struct ovs_list cores;      /* List of cpu cores on the numa node. */
    int numa_id;                /* numa node id. */
};
/* Cpu core on a numa node. */
struct cpu_core {
    struct hmap_node hmap_node; /* In 'all_cpu_cores', hashed on 'core_id'. */
    struct ovs_list list_node;  /* In 'numa_node->cores' list. */
    struct numa_node *numa;     /* numa node containing the core. */
    unsigned core_id;           /* Core id. */
};
/* Contains all 'struct numa_node's. */
/* Contains all 'struct numa_node's, keyed by hash of numa id. */
static struct hmap all_numa_nodes = HMAP_INITIALIZER(&all_numa_nodes);
/* Contains all 'struct cpu_core's, keyed by hash of core id. */
static struct hmap all_cpu_cores = HMAP_INITIALIZER(&all_cpu_cores);
/* True if numa node and core info are correctly extracted.
 * Set once by ovs_numa_init() when discovery found at least one node
 * and one core. */
static bool found_numa_and_core;
/* True if the module was initialized with dummy options. In this case, the
 * module must not interact with the actual cpus/nodes in the system. */
static bool dummy_numa = false;
/* If 'dummy_numa' is true, contains a copy of the dummy numa configuration
 * parameter (owned by this module, replaced on each ovs_numa_set_dummy()). */
static char *dummy_config;

/* Forward declaration; defined below with the other lookup helpers. */
static struct numa_node *get_numa_by_numa_id(int numa_id);
#ifdef __linux__
/* Returns true if every character of 'str' is a decimal digit (an empty
 * string also qualifies).  Returns false otherwise. */
static bool
contain_all_digits(const char *str)
{
    for (const char *p = str; *p; p++) {
        if (!isdigit((unsigned char) *p)) {
            return false;
        }
    }
    return true;
}
#endif /* __linux__ */
/* Allocates a new numa node with 'numa_id', registers it in
 * 'all_numa_nodes', and returns it. */
static struct numa_node *
insert_new_numa_node(int numa_id)
{
    struct numa_node *node = xzalloc(sizeof *node);

    node->numa_id = numa_id;
    ovs_list_init(&node->cores);
    hmap_insert(&all_numa_nodes, &node->hmap_node, hash_int(numa_id, 0));

    return node;
}
/* Allocates a new cpu core with 'core_id' belonging to numa node 'n',
 * registers it in 'all_cpu_cores' and on the node's core list, and
 * returns it. */
static struct cpu_core *
insert_new_cpu_core(struct numa_node *n, unsigned core_id)
{
    struct cpu_core *core = xzalloc(sizeof *core);

    core->core_id = core_id;
    core->numa = n;
    ovs_list_insert(&n->cores, &core->list_node);
    hmap_insert(&all_cpu_cores, &core->hmap_node, hash_int(core_id, 0));

    return core;
}
/* Has the same effect as discover_numa_and_core(), but instead of
* reading sysfs entries, extracts the info from the global variable
* 'dummy_config', which is set with ovs_numa_set_dummy().
*
* 'dummy_config' lists the numa_ids of each CPU separated by a comma, e.g.
* - "0,0,0,0": four cores on numa socket 0.
* - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
* - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
* - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
*/
static void
discover_numa_and_core_dummy(void)
{
    char *config = xstrdup(dummy_config);
    char *saveptr = NULL;
    unsigned core_id = 0;
    char *token;

    for (token = strtok_r(config, ",", &saveptr); token;
         token = strtok_r(NULL, ",", &saveptr)) {
        long numa_id = strtol(token, NULL, 10);
        struct hmap_node *hnode;
        struct numa_node *node;

        if (numa_id < 0 || numa_id >= MAX_NUMA_NODES) {
            /* Skip out-of-range entries without assigning a core id. */
            VLOG_WARN("Invalid numa node %ld", numa_id);
            continue;
        }

        /* Reuse the numa node if it was already created for an earlier
         * core, otherwise create it now. */
        hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0));
        node = hnode
               ? CONTAINER_OF(hnode, struct numa_node, hmap_node)
               : insert_new_numa_node(numa_id);

        insert_new_cpu_core(node, core_id++);
    }

    free(config);
}
#ifdef __linux__
/* Returns 1 if CPU 'core_id' is detected and online (i.e. its sysfs
 * topology entry exists), 0 otherwise.  Also returns 0 if the sysfs
 * path cannot be built. */
static int
cpu_detected(unsigned int core_id)
{
    char path[PATH_MAX];
    /* 'core_id' is unsigned, so the conversion specifier must be %u;
     * %d with an unsigned argument is a printf type mismatch. */
    int len = snprintf(path, sizeof(path),
                       "/sys/devices/system/cpu/cpu%u/topology/core_id",
                       core_id);
    if (len <= 0 || (unsigned) len >= sizeof(path)) {
        return 0;
    }
    if (access(path, F_OK) != 0) {
        return 0;
    }
    return 1;
}
#endif /* __linux__ */
/* Discovers all numa nodes and the corresponding cpu cores.
 * Constructs the 'struct numa_node' and 'struct cpu_core'.
 *
 * Linux-only; on other platforms this is a no-op.  Probes numa node ids
 * 0..MAX_NUMA_NODES-1 in order; on systems without NUMA support all cpus
 * are attached to a single node 0. */
static void
discover_numa_and_core(void)
{
#ifdef __linux__
    int i;
    DIR *dir;
    bool numa_supported = true;

    /* Check if NUMA supported on this system: the node directory is
     * absent (ENOENT) when the kernel exposes no NUMA topology. */
    dir = opendir("/sys/devices/system/node");
    if (!dir && errno == ENOENT) {
        numa_supported = false;
    }
    if (dir) {
        closedir(dir);
    }

    for (i = 0; i < MAX_NUMA_NODES; i++) {
        char* path;

        if (numa_supported) {
            /* Constructs the path to node /sys/devices/system/nodeX. */
            path = xasprintf("/sys/devices/system/node/node%d", i);
        } else {
            /* No NUMA: scan the flat cpu directory instead and attach
             * everything to node 0 (single loop iteration, see below). */
            path = xasprintf("/sys/devices/system/cpu/");
        }

        dir = opendir(path);
        /* Creates 'struct numa_node' if the 'dir' is non-null. */
        if (dir) {
            struct numa_node *n;
            struct dirent *subdir;

            n = insert_new_numa_node(i);

            /* Each "cpuN" entry in the directory is a candidate core;
             * only register it if its topology entry exists (online). */
            while ((subdir = readdir(dir)) != NULL) {
                if (!strncmp(subdir->d_name, "cpu", 3)
                    && contain_all_digits(subdir->d_name + 3)) {
                    unsigned core_id;

                    core_id = strtoul(subdir->d_name + 3, NULL, 10);
                    if (cpu_detected(core_id)) {
                        insert_new_cpu_core(n, core_id);
                    }
                }
            }
            closedir(dir);
        } else if (errno != ENOENT) {
            /* ENOENT just means node 'i' does not exist; anything else
             * is worth reporting. */
            VLOG_WARN("opendir(%s) failed (%s)", path,
                      ovs_strerror(errno));
        }

        free(path);
        if (!numa_supported) {
            /* The whole cpu directory was scanned as node 0; done. */
            break;
        }
    }
#endif /* __linux__ */
}
/* Looks up the 'struct cpu_core' with 'core_id' in 'all_cpu_cores'.
 * Returns NULL if no such core is known. */
static struct cpu_core *
get_core_by_core_id(unsigned core_id)
{
    struct cpu_core *candidate;

    HMAP_FOR_EACH_WITH_HASH (candidate, hmap_node, hash_int(core_id, 0),
                             &all_cpu_cores) {
        if (candidate->core_id == core_id) {
            return candidate;
        }
    }

    return NULL;
}
/* Looks up the 'struct numa_node' with 'numa_id' in 'all_numa_nodes'.
 * Returns NULL if no such node is known. */
static struct numa_node *
get_numa_by_numa_id(int numa_id)
{
    struct numa_node *candidate;

    HMAP_FOR_EACH_WITH_HASH (candidate, hmap_node, hash_int(numa_id, 0),
                             &all_numa_nodes) {
        if (candidate->numa_id == numa_id) {
            return candidate;
        }
    }

    return NULL;
}
/* Initializes the numa module: discovers nodes and cores on the first
 * call; subsequent calls are no-ops thanks to the 'once' guard. */
void
ovs_numa_init(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    if (ovsthread_once_start(&once)) {
        const struct numa_node *n;

        if (dummy_numa) {
            /* Topology comes from the ovs_numa_set_dummy() string. */
            discover_numa_and_core_dummy();
        } else {
            /* Topology comes from sysfs (Linux only; no-op elsewhere). */
            discover_numa_and_core();
        }

        HMAP_FOR_EACH(n, hmap_node, &all_numa_nodes) {
            VLOG_INFO("Discovered %"PRIuSIZE" CPU cores on NUMA node %d",
                      ovs_list_size(&n->cores), n->numa_id);
        }

        VLOG_INFO("Discovered %"PRIuSIZE" NUMA nodes and %"PRIuSIZE" CPU cores",
                  hmap_count(&all_numa_nodes), hmap_count(&all_cpu_cores));

        /* Discovery counts as successful only if both at least one node
         * and at least one core were found. */
        if (hmap_count(&all_numa_nodes) && hmap_count(&all_cpu_cores)) {
            found_numa_and_core = true;
        }

        ovsthread_once_done(&once);
    }
}
/* Extracts the numa node and core info from the 'config'. This is useful for
* testing purposes. The function must be called once, before ovs_numa_init().
*
* The format of 'config' is explained in the comment above
* discover_numa_and_core_dummy().*/
void
ovs_numa_set_dummy(const char *config)
{
    dummy_numa = true;
    ovs_assert(config);
    /* Allow repeated calls: drop any previously stored copy before
     * taking a new one (module owns 'dummy_config'). */
    free(dummy_config);
    dummy_config = xstrdup(config);
}
/* Returns true if discovery succeeded and 'numa_id' is below the number
 * of discovered numa nodes.
 *
 * NOTE(review): this compares against the node *count*, but the module
 * comment above states node ids may be non-consecutive (e.g. nodes 0 and
 * 8) — a real id >= the count would be rejected here.  Confirm whether
 * callers rely on this count-based check. */
bool
ovs_numa_numa_id_is_valid(int numa_id)
{
    return found_numa_and_core && numa_id < ovs_numa_get_n_numas();
}
/* Returns true if discovery succeeded and 'core_id' is below the number
 * of discovered cpu cores.
 *
 * NOTE(review): like ovs_numa_numa_id_is_valid(), this is a count-based
 * check; with non-consecutive core ids a real id >= the count would be
 * rejected — confirm against callers. */
bool
ovs_numa_core_id_is_valid(unsigned core_id)
{
    return found_numa_and_core && core_id < ovs_numa_get_n_cores();
}
/* Returns the number of discovered numa nodes, or OVS_NUMA_UNSPEC if
 * topology discovery did not succeed. */
int
ovs_numa_get_n_numas(void)
{
    if (!found_numa_and_core) {
        return OVS_NUMA_UNSPEC;
    }
    return hmap_count(&all_numa_nodes);
}
/* Returns the number of discovered cpu cores, or OVS_CORE_UNSPEC if
 * topology discovery did not succeed. */
int
ovs_numa_get_n_cores(void)
{
    if (!found_numa_and_core) {
        return OVS_CORE_UNSPEC;
    }
    return hmap_count(&all_cpu_cores);
}
/* Given 'core_id', returns the corresponding numa node id. Returns
* OVS_NUMA_UNSPEC if 'core_id' is invalid. */
int
ovs_numa_get_numa_id(unsigned core_id)
{
struct cpu_core *core = get_core_by_core_id(core_id);
if (core) {
return core->numa->numa_id;
}
return OVS_NUMA_UNSPEC;
}
/* Returns the number of cpu cores on numa node 'numa_id', or
 * OVS_CORE_UNSPEC if 'numa_id' is not a known node. */
int
ovs_numa_get_n_cores_on_numa(int numa_id)
{
    struct numa_node *numa = get_numa_by_numa_id(numa_id);

    return numa ? (int) ovs_list_size(&numa->cores) : OVS_CORE_UNSPEC;
}
/* Returns the largest core_id.
*
* Return OVS_CORE_UNSPEC, if core_id information is not found.
*
* Returning OVS_CORE_UNSPEC comes at a caveat. The caller function
* must remember to check the return value of this callee function
* against OVS_CORE_UNSPEC. OVS_CORE_UNSPEC is a positive integer
* INT_MAX, which the caller may interpret it as the largest
* core_id if it's not checking for it.
*/
unsigned
ovs_numa_get_largest_core_id(void)
{
    struct cpu_core *core;
    unsigned largest = 0;

    if (!found_numa_and_core) {
        /* Caller must check for OVS_CORE_UNSPEC; see comment above. */
        return OVS_CORE_UNSPEC;
    }

    HMAP_FOR_EACH (core, hmap_node, &all_cpu_cores) {
        largest = core->core_id > largest ? core->core_id : largest;
    }

    return largest;
}
/* Allocates and returns an empty numa dump with both maps initialized. */
static struct ovs_numa_dump *
ovs_numa_dump_create(void)
{
    struct ovs_numa_dump *d = xmalloc(sizeof *d);

    hmap_init(&d->numas);
    hmap_init(&d->cores);

    return d;
}
/* Adds core 'core_id' on numa node 'numa_id' to 'dump', creating the
 * per-numa entry on first sight of a node or bumping its core count
 * otherwise. */
static void
ovs_numa_dump_add(struct ovs_numa_dump *dump, int numa_id, int core_id)
{
    struct ovs_numa_info_core *c = xzalloc(sizeof *c);
    struct ovs_numa_info_numa *n;

    c->numa_id = numa_id;
    c->core_id = core_id;
    hmap_insert(&dump->cores, &c->hmap_node, hash_2words(numa_id, core_id));

    /* If this numa node is already in the dump, just count the core. */
    HMAP_FOR_EACH_WITH_HASH (n, hmap_node, hash_int(numa_id, 0),
                             &dump->numas) {
        if (n->numa_id == numa_id) {
            n->n_cores++;
            return;
        }
    }

    /* First core seen on this numa node: create its entry. */
    n = xzalloc(sizeof *n);
    n->numa_id = numa_id;
    n->n_cores = 1;
    hmap_insert(&dump->numas, &n->hmap_node, hash_int(numa_id, 0));
}
/* Given the 'numa_id', returns a dump of all cores on that numa node.
 * The dump is empty if 'numa_id' is unknown. */
struct ovs_numa_dump *
ovs_numa_dump_cores_on_numa(int numa_id)
{
    struct ovs_numa_dump *dump = ovs_numa_dump_create();
    struct numa_node *node = get_numa_by_numa_id(numa_id);

    if (node) {
        struct cpu_core *core;

        LIST_FOR_EACH (core, list_node, &node->cores) {
            ovs_numa_dump_add(dump, node->numa_id, core->core_id);
        }
    }

    return dump;
}
struct ovs_numa_dump *
ovs_numa_dump_cores_with_cmask(const char *cmask)
{
struct ovs_numa_dump *dump = ovs_numa_dump_create();
int core_id = 0;
int end_idx;
/* Ignore leading 0x. */
end_idx = 0;
if (!strncmp(cmask, "0x", 2) || !strncmp(cmask, "0X", 2)) {
end_idx = 2;
}
for (int i = strlen(cmask) - 1; i >= end_idx; i--) {
char hex = cmask[i];
int bin;
bin = hexit_value(hex);
if (bin == -1) {
VLOG_WARN("Invalid cpu mask: %c", cmask[i]);
bin = 0;
}
for (int j = 0; j < 4; j++) {
if ((bin >> j) & 0x1) {
struct cpu_core *core = get_core_by_core_id(core_id);
if (core) {
ovs_numa_dump_add(dump,
core->numa->numa_id,
core->core_id);
}
}
core_id++;
}
}
return dump;
}
/* Returns a dump with up to 'cores_per_numa' cores taken from each
 * discovered numa node, in the node's list order. */
struct ovs_numa_dump *
ovs_numa_dump_n_cores_per_numa(int cores_per_numa)
{
    struct ovs_numa_dump *dump = ovs_numa_dump_create();
    const struct numa_node *node;

    HMAP_FOR_EACH (node, hmap_node, &all_numa_nodes) {
        const struct cpu_core *core;
        int taken = 0;

        LIST_FOR_EACH (core, list_node, &node->cores) {
            if (taken++ >= cores_per_numa) {
                break;
            }
            ovs_numa_dump_add(dump, core->numa->numa_id, core->core_id);
        }
    }

    return dump;
}
/* Returns true if 'dump' contains the core 'core_id' on numa node
 * 'numa_id'. */
bool
ovs_numa_dump_contains_core(const struct ovs_numa_dump *dump,
                            int numa_id, unsigned core_id)
{
    struct ovs_numa_info_core *entry;

    HMAP_FOR_EACH_WITH_HASH (entry, hmap_node, hash_2words(numa_id, core_id),
                             &dump->cores) {
        if (entry->numa_id == numa_id && entry->core_id == core_id) {
            return true;
        }
    }

    return false;
}
/* Returns the number of cores stored in 'dump'. */
size_t
ovs_numa_dump_count(const struct ovs_numa_dump *dump)
{
    return hmap_count(&dump->cores);
}
/* Frees 'dump' and everything it contains.  A null 'dump' is a no-op. */
void
ovs_numa_dump_destroy(struct ovs_numa_dump *dump)
{
    struct ovs_numa_info_numa *numa;
    struct ovs_numa_info_core *core;

    if (!dump) {
        return;
    }

    HMAP_FOR_EACH_POP (core, hmap_node, &dump->cores) {
        free(core);
    }
    HMAP_FOR_EACH_POP (numa, hmap_node, &dump->numas) {
        free(numa);
    }

    hmap_destroy(&dump->cores);
    hmap_destroy(&dump->numas);
    free(dump);
}
/* Returns a dump of the known cores the current thread is allowed to run
 * on, or NULL in dummy-numa mode, on non-Linux platforms, on a
 * pthread_getaffinity_np() failure, or when none of the discovered cores
 * are in the thread's affinity set.  Caller owns the returned dump. */
struct ovs_numa_dump *
ovs_numa_thread_getaffinity_dump(void)
{
    if (dummy_numa) {
        /* Nothing to do. */
        return NULL;
    }

#ifndef __linux__
    return NULL;
#else
    struct ovs_numa_dump *dump;
    const struct numa_node *n;
    cpu_set_t cpuset;
    int err;

    CPU_ZERO(&cpuset);
    err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset);
    if (err) {
        VLOG_ERR("Thread getaffinity error: %s", ovs_strerror(err));
        return NULL;
    }

    dump = ovs_numa_dump_create();
    /* Intersect the affinity set with the discovered topology: only
     * cores this module knows about end up in the dump. */
    HMAP_FOR_EACH (n, hmap_node, &all_numa_nodes) {
        const struct cpu_core *core;

        LIST_FOR_EACH (core, list_node, &n->cores) {
            if (CPU_ISSET(core->core_id, &cpuset)) {
                ovs_numa_dump_add(dump, core->numa->numa_id, core->core_id);
            }
        }
    }

    /* An empty intersection is reported as NULL, not an empty dump. */
    if (!ovs_numa_dump_count(dump)) {
        ovs_numa_dump_destroy(dump);
        return NULL;
    }

    return dump;
#endif /* __linux__ */
}
/* Sets the current thread's cpu affinity to exactly the cores in 'dump'.
 * Returns 0 on success or a positive errno value from
 * pthread_setaffinity_np() on failure.  A null 'dump' or dummy-numa mode
 * is a successful no-op; non-Linux platforms return EOPNOTSUPP. */
int
ovs_numa_thread_setaffinity_dump(const struct ovs_numa_dump *dump)
{
    if (!dump || dummy_numa) {
        /* Nothing to do. */
        return 0;
    }

#ifdef __linux__
    const struct ovs_numa_info_core *core;
    cpu_set_t cpuset;
    int err;

    CPU_ZERO(&cpuset);
    FOR_EACH_CORE_ON_DUMP (core, dump) {
        CPU_SET(core->core_id, &cpuset);
    }

    err = pthread_setaffinity_np(pthread_self(), sizeof cpuset, &cpuset);
    if (err) {
        VLOG_ERR("Thread setaffinity error: %s", ovs_strerror(err));
        return err;
    }

    return 0;
#else /* !__linux__ */
    return EOPNOTSUPP;
#endif /* __linux__ */
}
int ovs_numa_thread_setaffinity_core(unsigned core_id)
{
const struct cpu_core *core = get_core_by_core_id(core_id);
struct ovs_numa_dump *affinity = ovs_numa_dump_create();
int ret = EINVAL;
if (core) {
ovs_numa_dump_add(affinity, core->numa->numa_id, core->core_id);
ret = ovs_numa_thread_setaffinity_dump(affinity);
}
ovs_numa_dump_destroy(affinity);
return ret;
}