2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 18:07:40 +00:00

ovs-numa: Support non-contiguous numa nodes and offline CPU cores.

This change removes the assumption that numa nodes and cores are numbered
contiguously in linux.  This change is required to support some Power
systems.

A check has been added to verify that cores are online;
offline cores result in non-contiguously numbered core ids.

DPDK EAL option generation is updated to work with non-contiguous numa nodes.
These options can be seen in the ovs-vswitchd.log.  For example:
a system containing only numa nodes 0 and 8 will generate the following:

EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \
                       --socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0

Tests for pmd and dpif-netdev have been updated to validate non-contiguous
numbered nodes.

Signed-off-by: David Wilder <dwilder@us.ibm.com>
Acked-by: Kevin Traynor <ktraynor@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
This commit is contained in:
David Wilder 2021-06-22 11:53:08 -07:00 committed by Ilya Maximets
parent 154983c592
commit 3da3cc1a0c
6 changed files with 144 additions and 32 deletions

1
NEWS
View File

@ -16,6 +16,7 @@ Post-v2.15.0
* Auto load balancing of PMDs now partially supports cross-NUMA polling * Auto load balancing of PMDs now partially supports cross-NUMA polling
cases, e.g if all PMD threads are running on the same NUMA node. cases, e.g if all PMD threads are running on the same NUMA node.
* Userspace datapath now supports up to 2^18 meters. * Userspace datapath now supports up to 2^18 meters.
* Added support for systems with non-contiguous NUMA nodes and core ids.
- ovs-ctl: - ovs-ctl:
* New option '--no-record-hostname' to disable hostname configuration * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup. in ovsdb on startup.

View File

@ -130,22 +130,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
} }
} }
static int
compare_numa_node_list(const void *a_, const void *b_)
{
int a = *(const int *) a_;
int b = *(const int *) b_;
if (a < b) {
return -1;
}
if (a > b) {
return 1;
}
return 0;
}
static char * static char *
construct_dpdk_socket_mem(void) construct_dpdk_socket_mem(void)
{ {
const char *def_value = "1024"; const char *def_value = "1024";
int numa, numa_nodes = ovs_numa_get_n_numas();
struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER; struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER;
if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) { /* Build a list of all numa nodes with at least one core. */
numa_nodes = 1; struct ovs_numa_dump *dump = ovs_numa_dump_n_cores_per_numa(1);
} size_t n_numa_nodes = hmap_count(&dump->numas);
int *numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list);
ds_put_cstr(&dpdk_socket_mem, def_value); const struct ovs_numa_info_numa *node;
for (numa = 1; numa < numa_nodes; ++numa) { int k = 0, last_node = 0;
FOR_EACH_NUMA_ON_DUMP(node, dump) {
if (k >= n_numa_nodes) {
break;
}
numa_node_list[k++] = node->numa_id;
}
qsort(numa_node_list, k, sizeof *numa_node_list, compare_numa_node_list);
for (int i = 0; i < n_numa_nodes; i++) {
while (numa_node_list[i] > last_node &&
numa_node_list[i] != OVS_NUMA_UNSPEC &&
numa_node_list[i] <= MAX_NUMA_NODES) {
if (last_node == 0) {
ds_put_format(&dpdk_socket_mem, "%s", "0");
} else {
ds_put_format(&dpdk_socket_mem, ",%s", "0");
}
last_node++;
}
if (numa_node_list[i] == 0) {
ds_put_format(&dpdk_socket_mem, "%s", def_value);
} else {
ds_put_format(&dpdk_socket_mem, ",%s", def_value); ds_put_format(&dpdk_socket_mem, ",%s", def_value);
} }
last_node++;
}
free(numa_node_list);
ovs_numa_dump_destroy(dump);
return ds_cstr(&dpdk_socket_mem); return ds_cstr(&dpdk_socket_mem);
} }

View File

@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
* This module stores the affinity information of numa nodes and cpu cores. * This module stores the affinity information of numa nodes and cpu cores.
* It also provides functions to bookkeep the pin of threads on cpu cores. * It also provides functions to bookkeep the pin of threads on cpu cores.
* *
* It is assumed that the numa node ids and cpu core ids all start from 0 and * It is assumed that the numa node ids and cpu core ids all start from 0.
* range continuously. So, for example, if 'ovs_numa_get_n_cores()' returns N, * There is no guarantee that node and cpu ids are numbered consecutively.
* user can assume core ids from 0 to N-1 are all valid and there is a * So, for example, if two nodes exist with ids 0 and 8,
* 'struct cpu_core' for each id. * 'ovs_numa_get_n_nodes()' will return 2, no assumption of node numbering
* should be made.
* *
* NOTE, this module should only be used by the main thread. * NOTE, this module should only be used by the main thread.
* *
* NOTE, the assumption above will fail when cpu hotplug is used. In that * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be
* case ovs-numa will not function correctly. For now, add a TODO entry * invalidated whenever the system topology changes. Support for detecting
* for addressing it in the future. * topology changes has not been included. For now, add a TODO entry for
* addressing it in the future.
* *
* TODO: Fix ovs-numa when cpu hotplug is used. * TODO: Fix ovs-numa when cpu hotplug is used.
*/ */
#define MAX_NUMA_NODES 128
/* numa node. */ /* numa node. */
struct numa_node { struct numa_node {
@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id)
* - "0,0,0,0": four cores on numa socket 0. * - "0,0,0,0": four cores on numa socket 0.
* - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets. * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
* - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets. * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
* * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
* The different numa ids must be consecutives or the function will abort. */ */
static void static void
discover_numa_and_core_dummy(void) discover_numa_and_core_dummy(void)
{ {
char *conf = xstrdup(dummy_config); char *conf = xstrdup(dummy_config);
char *id, *saveptr = NULL; char *id, *saveptr = NULL;
unsigned i = 0; unsigned i = 0;
long max_numa_id = 0;
for (id = strtok_r(conf, ",", &saveptr); id; for (id = strtok_r(conf, ",", &saveptr); id;
id = strtok_r(NULL, ",", &saveptr)) { id = strtok_r(NULL, ",", &saveptr)) {
@ -152,8 +152,6 @@ discover_numa_and_core_dummy(void)
continue; continue;
} }
max_numa_id = MAX(max_numa_id, numa_id);
hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0)); hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0));
if (hnode) { if (hnode) {
@ -169,11 +167,28 @@ discover_numa_and_core_dummy(void)
free(conf); free(conf);
if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
ovs_fatal(0, "dummy numa contains non consecutive numa ids");
}
} }
#ifdef __linux__
/* Check if a CPU is detected and online. */
static int
cpu_detected(unsigned int core_id)
{
char path[PATH_MAX];
int len = snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/topology/core_id",
core_id);
if (len <= 0 || (unsigned) len >= sizeof(path)) {
return 0;
}
if (access(path, F_OK) != 0) {
return 0;
}
return 1;
}
#endif /* __linux__ */
/* Discovers all numa nodes and the corresponding cpu cores. /* Discovers all numa nodes and the corresponding cpu cores.
* Constructs the 'struct numa_node' and 'struct cpu_core'. */ * Constructs the 'struct numa_node' and 'struct cpu_core'. */
static void static void
@ -219,9 +234,11 @@ discover_numa_and_core(void)
unsigned core_id; unsigned core_id;
core_id = strtoul(subdir->d_name + 3, NULL, 10); core_id = strtoul(subdir->d_name + 3, NULL, 10);
if (cpu_detected(core_id)) {
insert_new_cpu_core(n, core_id); insert_new_cpu_core(n, core_id);
} }
} }
}
closedir(dir); closedir(dir);
} else if (errno != ENOENT) { } else if (errno != ENOENT) {
VLOG_WARN("opendir(%s) failed (%s)", path, VLOG_WARN("opendir(%s) failed (%s)", path,
@ -229,7 +246,7 @@ discover_numa_and_core(void)
} }
free(path); free(path);
if (!dir || !numa_supported) { if (!numa_supported) {
break; break;
} }
} }

View File

@ -26,6 +26,8 @@
#define OVS_CORE_UNSPEC INT_MAX #define OVS_CORE_UNSPEC INT_MAX
#define OVS_NUMA_UNSPEC INT_MAX #define OVS_NUMA_UNSPEC INT_MAX
#define MAX_NUMA_NODES 128
/* Dump of a list of 'struct ovs_numa_info'. */ /* Dump of a list of 'struct ovs_numa_info'. */
struct ovs_numa_dump { struct ovs_numa_dump {
struct hmap cores; struct hmap cores;

View File

@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE],
fail-mode=secure -- \ fail-mode=secure -- \
add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \ add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \
add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [], add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [],
[m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])]) [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=normal]) AT_CHECK([ovs-ofctl add-flow br0 action=normal])

View File

@ -361,8 +361,8 @@ AT_SETUP([PMD - change numa node])
OVS_VSWITCHD_START( OVS_VSWITCHD_START(
[add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \ [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \ add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
set Open_vSwitch . other_config:pmd-cpu-mask=3 set Open_vSwitch . other_config:pmd-cpu-mask=7
], [], [], [--dummy-numa 0,1]) ], [], [], [--dummy-numa 0,1,8])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=controller]) AT_CHECK([ovs-ofctl add-flow br0 action=controller])
@ -432,6 +432,40 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
]) ])
AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 8 2
p1 1 8 2
p2 0 1 1
p2 1 1 1
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
OVS_VSWITCHD_STOP OVS_VSWITCHD_STOP
AT_CLEANUP AT_CLEANUP
@ -584,7 +618,7 @@ AT_CLEANUP
AT_SETUP([PMD - rxq affinity - NUMA]) AT_SETUP([PMD - rxq affinity - NUMA])
OVS_VSWITCHD_START( OVS_VSWITCHD_START(
[], [], [], [--dummy-numa 0,0,0,1,1]) [], [], [], [--dummy-numa 0,0,0,1,1,8,8])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=controller]) AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
@ -601,21 +635,38 @@ p1 1 0 2
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"]) AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
dnl We moved the queues to different numa node. Expecting threads on dnl We moved the queues to different contiguous numa node. Expecting threads on
dnl NUMA node 1 to be created. dnl NUMA node 1 to be created.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3 p1 0 1 3
p1 1 1 4 p1 1 1 4
]) ])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"])
dnl We moved the queues to different non-contiguous numa node. Expecting threads on
dnl NUMA node 8 to be created.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 8 5
p1 1 8 6
])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"]) AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
dnl Queues split between NUMA nodes. dnl Queues split between contiguous NUMA nodes.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3 p1 0 1 3
p1 1 0 1 p1 1 0 1
]) ])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:1"])
dnl Queues splitted between non-contiguous NUMA nodes.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 8 5
p1 1 0 1
])
AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity]) AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity])
dnl We removed the rxq-affinity request. dpif-netdev should assign queues dnl We removed the rxq-affinity request. dpif-netdev should assign queues