diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst
index f43819be0..2e8cf5edb 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -353,10 +353,6 @@ and can differ significantly depending on system configuration. The actual
time not processing packets will be determined by the sleep and processor
wake-up times and should be tested with each system configuration.
-The current configuration of the PMD load based sleeping can be shown with::
-
- $ ovs-appctl dpif-netdev/pmd-sleep-show
-
Sleep time statistics for 10 secs can be seen with::
$ ovs-appctl dpif-netdev/pmd-stats-clear \
@@ -379,5 +375,35 @@ system configuration (e.g. enabling processor C-states) and workloads.
extra latency before the PMD thread returns to processing packets at full
rate.
+Maximum sleep values can also be set for individual PMD threads using
+key:value pairs in the form of core:max_sleep. A PMD thread that has been
+assigned a specific value will use it; any PMD thread that does not have
+a specific value will use the current global value.
+
+Specified values for individual PMD threads can be added or removed at
+any time.
+
+For example, to set PMD threads on cores 8 and 9 to never request a load based
+sleep and all other PMD threads to be able to request a max sleep of
+50 microseconds (us)::
+
+ $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=50,8:0,9:0
+
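+For example, to later remove the per-core values so that all PMD threads
+again use the global 50 microsecond maximum, set the value without any
+key:value pairs::
+
+ $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=50
+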
+The max sleep value for each PMD thread can be checked in the logs or with::
+
+ $ ovs-appctl dpif-netdev/pmd-sleep-show
+ Default max sleep: 50 us
+ pmd thread numa_id 0 core_id 8:
+ max sleep: 0 us
+ pmd thread numa_id 1 core_id 9:
+ max sleep: 0 us
+ pmd thread numa_id 0 core_id 10:
+ max sleep: 50 us
+ pmd thread numa_id 1 core_id 11:
+ max sleep: 50 us
+ pmd thread numa_id 0 core_id 12:
+ max sleep: 50 us
+ pmd thread numa_id 1 core_id 13:
+ max sleep: 50 us
+
.. _ovs-vswitchd(8):
http://openvswitch.org/support/dist-docs/ovs-vswitchd.8.html
diff --git a/NEWS b/NEWS
index 591d5e470..270ed6673 100644
--- a/NEWS
+++ b/NEWS
@@ -32,6 +32,10 @@ Post-v3.2.0
* Added support for Generic Segmentation Offloading for the cases where
TSO is enabled but not supported by an egress interface (except for
tunnel interfaces).
+ * 'pmd-sleep-max' is updated to also accept pmd-thread-core:sleep-max
+ key:value pairs. The existing behaviour is maintained: a value that is
+ not part of a key:value pair is applied to all other PMD thread cores.
+ 'pmd-sleep-show' is updated to show the maximum sleep for each PMD
+ thread core.
v3.2.0 - 17 Aug 2023
diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index 1ec3cd794..8715b3837 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -180,6 +180,9 @@ struct dp_netdev_pmd_thread {
int numa_id; /* numa node id of this pmd thread. */
bool isolated;
+ /* Max sleep request in microseconds. */
+ atomic_uint64_t max_sleep;
+
/* Queue id used by this pmd thread to send packets on all netdevs if
* XPS disabled for this netdev. All static_tx_qid's are unique and less
* than 'cmap_count(dp->poll_threads)'. */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 9a59a1b03..0aea9d0b8 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -179,6 +179,11 @@ static struct odp_support dp_netdev_support = {
/* Time in uS to increment a pmd thread sleep time. */
#define PMD_SLEEP_INC_US 1
+struct pmd_sleep {
+ unsigned core_id;
+ uint64_t max_sleep;
+};
+
struct dpcls {
struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
odp_port_t in_port;
@@ -287,8 +292,8 @@ struct dp_netdev {
atomic_uint32_t emc_insert_min;
/* Enable collection of PMD performance metrics. */
atomic_bool pmd_perf_metrics;
- /* Max load based sleep request. */
- atomic_uint64_t pmd_max_sleep;
+ /* Default max load based sleep request. */
+ uint64_t pmd_max_sleep_default;
/* Enable the SMC cache from ovsdb config */
atomic_bool smc_enable_db;
@@ -326,6 +331,9 @@ struct dp_netdev {
/* Cpu mask for pin of pmd threads. */
char *pmd_cmask;
+ /* PMD max load based sleep request user string. */
+ char *max_sleep_list;
+
uint64_t last_tnl_conf_seq;
struct conntrack *conntrack;
@@ -1428,6 +1436,19 @@ dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
ds_destroy(&reply);
}
+static void
+pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id,
+ uint64_t pmd_max_sleep)
+{
+ if (core_id == NON_PMD_CORE_ID) {
+ return;
+ }
+ ds_put_format(reply,
+ "pmd thread numa_id %d core_id %d:\n"
+ " max sleep: %4"PRIu64" us\n",
+ numa_id, core_id, pmd_max_sleep);
+}
+
static void
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
void *aux)
@@ -1442,9 +1463,8 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
unsigned int secs = 0;
unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX)
/ INTERVAL_USEC_TO_SEC;
- uint64_t default_max_sleep = 0;
bool show_header = true;
-
+ uint64_t max_sleep;
ovs_mutex_lock(&dp_netdev_mutex);
@@ -1512,12 +1532,13 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
} else if (type == PMD_INFO_SLEEP_SHOW) {
if (show_header) {
- atomic_read_relaxed(&dp->pmd_max_sleep, &default_max_sleep);
- ds_put_format(&reply, "Default max sleep: %4"PRIu64" us",
- default_max_sleep);
- ds_put_cstr(&reply, "\n");
+ ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n",
+ dp->pmd_max_sleep_default);
show_header = false;
}
+ atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
+ pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id,
+ max_sleep);
}
}
free(pmd_list);
@@ -1906,6 +1927,8 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
return error;
}
+ dp->max_sleep_list = NULL;
+
dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
*dpp = dp;
return 0;
@@ -2015,6 +2038,7 @@ dp_netdev_free(struct dp_netdev *dp)
dp_netdev_meter_destroy(dp);
+ free(dp->max_sleep_list);
free(dp->pmd_cmask);
free(CONST_CAST(char *, dp->name));
free(dp);
@@ -4847,6 +4871,209 @@ set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log)
}
}
+static int
+parse_pmd_sleep_list(const char *max_sleep_list,
+ struct pmd_sleep **pmd_sleeps)
+{
+ char *list, *copy, *key, *value;
+ int num_vals = 0;
+
+ if (!max_sleep_list) {
+ return num_vals;
+ }
+
+ list = copy = xstrdup(max_sleep_list);
+
+ while (ofputil_parse_key_value(&list, &key, &value)) {
+ uint64_t temp, pmd_max_sleep;
+ char *error = NULL;
+ unsigned core;
+ int i;
+
+ error = str_to_u64(key, &temp);
+ if (error) {
+ free(error);
+ continue;
+ }
+
+ if (value[0] == '\0') {
+ /* No value specified. key is dp default. */
+ core = UINT_MAX;
+ pmd_max_sleep = temp;
+ } else {
+ error = str_to_u64(value, &pmd_max_sleep);
+ if (!error && temp < UINT_MAX) {
+ /* Key is pmd core id. */
+ core = (unsigned) temp;
+ } else {
+ free(error);
+ continue;
+ }
+ }
+
+ /* Detect duplicate max sleep values. */
+ for (i = 0; i < num_vals; i++) {
+ if ((*pmd_sleeps)[i].core_id == core) {
+ break;
+ }
+ }
+ if (i == num_vals) {
+ /* Not duplicate, add a new entry. */
+ *pmd_sleeps = xrealloc(*pmd_sleeps,
+ (num_vals + 1) * sizeof **pmd_sleeps);
+ num_vals++;
+ }
+
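+ /* Cap the requested sleep at the RCU quiesce interval. */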
+ pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
+
+ (*pmd_sleeps)[i].core_id = core;
+ (*pmd_sleeps)[i].max_sleep = pmd_max_sleep;
+ }
+
+ free(copy);
+ return num_vals;
+}
+
+static void
+log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep)
+{
+ if (core_id == NON_PMD_CORE_ID) {
+ return;
+ }
+ VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, "
+ "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep);
+}
+
+static void
+pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
+{
+ uint64_t max_sleep = dp->pmd_max_sleep_default;
+ struct pmd_sleep *pmd_sleeps = NULL;
+ int num_vals;
+
+ num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps);
+
+ /* Check if the user has set a specific value for this pmd. */
+ for (int i = 0; i < num_vals; i++) {
+ if (pmd_sleeps[i].core_id == pmd->core_id) {
+ max_sleep = pmd_sleeps[i].max_sleep;
+ break;
+ }
+ }
+ atomic_init(&pmd->max_sleep, max_sleep);
+ log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep);
+ free(pmd_sleeps);
+}
+
+static bool
+assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals,
+ struct pmd_sleep *pmd_sleeps)
+{
+ struct dp_netdev_pmd_thread *pmd;
+ bool value_changed = false;
+
+ CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+ uint64_t new_max_sleep, cur_pmd_max_sleep;
+
+ if (pmd->core_id == NON_PMD_CORE_ID) {
+ continue;
+ }
+
+ /* Default to global value. */
+ new_max_sleep = dp->pmd_max_sleep_default;
+
+ /* Check for pmd specific value. */
+ for (int i = 0; i < num_vals; i++) {
+ if (pmd->core_id == pmd_sleeps[i].core_id) {
+ new_max_sleep = pmd_sleeps[i].max_sleep;
+ break;
+ }
+ }
+ atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
+ if (new_max_sleep != cur_pmd_max_sleep) {
+ atomic_store_relaxed(&pmd->max_sleep, new_max_sleep);
+ value_changed = true;
+ }
+ }
+ return value_changed;
+}
+
+static void
+log_all_pmd_sleeps(struct dp_netdev *dp)
+{
+ struct dp_netdev_pmd_thread **pmd_list = NULL;
+ struct dp_netdev_pmd_thread *pmd;
+ size_t n;
+
+ VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.",
+ dp->pmd_max_sleep_default);
+
+ sorted_poll_thread_list(dp, &pmd_list, &n);
+
+ for (size_t i = 0; i < n; i++) {
+ uint64_t cur_pmd_max_sleep;
+
+ pmd = pmd_list[i];
+ atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
+ log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep);
+ }
+ free(pmd_list);
+}
+
+static bool
+set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config)
+{
+ const char *max_sleep_list = smap_get(config, "pmd-sleep-max");
+ struct pmd_sleep *pmd_sleeps = NULL;
+ uint64_t default_max_sleep = 0;
+ bool default_changed = false;
+ bool pmd_changed = false;
+ uint64_t pmd_maxsleep;
+ int num_vals = 0;
+
+ /* Check for deprecated 'pmd-maxsleep' value. */
+ pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX);
+ if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) {
+ VLOG_WARN_ONCE("pmd-maxsleep is deprecated. "
+ "Please use pmd-sleep-max instead.");
+ default_max_sleep = pmd_maxsleep;
+ }
+
+ /* Check if there is no change in string or value. */
+ if (!!dp->max_sleep_list == !!max_sleep_list) {
+ if (max_sleep_list
+ ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list)
+ : default_max_sleep == dp->pmd_max_sleep_default) {
+ return false;
+ }
+ }
+
+ /* Free existing string and copy new one (if any). */
+ free(dp->max_sleep_list);
+ dp->max_sleep_list = nullable_xstrdup(max_sleep_list);
+
+ if (max_sleep_list) {
+ num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps);
+
+ /* Check if the user has set a global value. */
+ for (int i = 0; i < num_vals; i++) {
+ if (pmd_sleeps[i].core_id == UINT_MAX) {
+ default_max_sleep = pmd_sleeps[i].max_sleep;
+ break;
+ }
+ }
+ }
+
+ if (dp->pmd_max_sleep_default != default_max_sleep) {
+ dp->pmd_max_sleep_default = default_max_sleep;
+ default_changed = true;
+ }
+ pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps);
+
+ free(pmd_sleeps);
+ return default_changed || pmd_changed;
+}
+
/* Applies datapath configuration from the database. Some of the changes are
* actually applied in dpif_netdev_run(). */
static int
@@ -4864,7 +5091,6 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
uint64_t rebalance_intvl;
uint8_t cur_rebalance_load;
uint32_t rebalance_load, rebalance_improve;
- uint64_t pmd_max_sleep, cur_pmd_max_sleep;
bool log_autolb = false;
enum sched_assignment_type pmd_rxq_assign_type;
static bool first_set_config = true;
@@ -5015,26 +5241,12 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
set_pmd_auto_lb(dp, autolb_state, log_autolb);
- pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", UINT64_MAX);
- if (pmd_max_sleep != UINT64_MAX) {
- VLOG_WARN("pmd-maxsleep is deprecated. "
- "Please use pmd-sleep-max instead.");
- } else {
- pmd_max_sleep = 0;
+ bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config);
+ if (first_set_config || sleep_changed) {
+ log_all_pmd_sleeps(dp);
}
- pmd_max_sleep = smap_get_ullong(other_config, "pmd-sleep-max",
- pmd_max_sleep);
- pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
- atomic_read_relaxed(&dp->pmd_max_sleep, &cur_pmd_max_sleep);
- if (first_set_config || pmd_max_sleep != cur_pmd_max_sleep) {
- atomic_store_relaxed(&dp->pmd_max_sleep, pmd_max_sleep);
- VLOG_INFO("PMD max sleep request is %"PRIu64" usecs.", pmd_max_sleep);
- VLOG_INFO("PMD load based sleeps are %s.",
- pmd_max_sleep ? "enabled" : "disabled" );
- }
-
- first_set_config = false;
+ first_set_config = false;
return 0;
}
@@ -7063,7 +7275,7 @@ reload:
pmd_perf_start_iteration(s);
atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
- atomic_read_relaxed(&pmd->dp->pmd_max_sleep, &max_sleep);
+ atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
for (i = 0; i < poll_cnt; i++) {
@@ -7650,6 +7862,8 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
hmap_init(&pmd->send_port_cache);
cmap_init(&pmd->tx_bonds);
+ pmd_init_max_sleep(dp, pmd);
+
/* Initialize DPIF function pointer to the default configured version. */
atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());
diff --git a/tests/pmd.at b/tests/pmd.at
index 06cc90477..35a44b4df 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -60,20 +60,32 @@ m4_define([CHECK_PMD_THREADS_CREATED], [
fi
])
-dnl CHECK_DP_SLEEP_MAX([max_sleep], [enabled], [+line])
+dnl CHECK_DP_SLEEP_MAX([max_sleep], [+line])
dnl
-dnl Checks correct pmd load based sleep is set for the datapath.
+dnl Checks correct pmd load based sleep value for the datapath.
dnl Checking starts from line number 'line' in ovs-vswithd.log .
m4_define([CHECK_DP_SLEEP_MAX], [
- SLEEP_TIME="PMD max sleep request is $1 usecs."
- SLEEP_STATE="PMD load based sleeps are $2."
- line_st=$3
+ SLEEP_TIME="Default PMD thread max sleep: *[$1] us."
+ line_st=$2
if [[ -z "$line_st" ]]
then
line_st="+0"
fi
OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_TIME"])
- OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_STATE"])
+])
+
+dnl CHECK_PMD_SLEEP_MAX([numa_id], [core_id], [max_sleep], [+line])
+dnl
+dnl Checks the max sleep time logged for the pmd on core_id.
+dnl Checking starts from line number 'line' in ovs-vswitchd.log.
+m4_define([CHECK_PMD_SLEEP_MAX], [
+ PATTERN="PMD thread on numa_id: *[$1], core id: *[$2], max sleep: *[$3] us."
+ line_st=$4
+ if [[ -z "$line_st" ]]
+ then
+ line_st="+0"
+ fi
+ OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$PATTERN"])
])
m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/"])
@@ -823,6 +821,33 @@
[The remainder of the patch, what appears to be the vswitchd schema
documentation for pmd-sleep-max, is garbled in this excerpt. Recoverable
fragments: the key "Specifies the maximum sleep time that will be requested
in microseconds per iteration for a PMD thread which has received zero ...",
"The maximum value is 10000 microseconds", and the new
"other_config:pmd-sleep-max=<pmd-sleep-list>" form followed by a "where"
clause.]