2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-31 06:15:47 +00:00

dpif-netdev: Calculate per numa variance.

Currently, pmd_rebalance_dry_run() calculates the overall variance of
all pmds regardless of their numa location. The overall result may
hide an imbalance within an individual numa node.

Consider the following case. Numa0 is free because VMs on numa0
are not sending pkts, while numa1 is busy. Within numa1, pmd
workloads are not balanced. Obviously, moving 500 kpps of workload from
pmd 126 to pmd 62 will make numa1 much more balanced. For numa1
the variance improvement will be almost 100%, because after rebalance
each pmd in numa1 holds the same workload (variance ~= 0). But the overall
variance improvement is only about 20%, which may not trigger auto_lb.

```
numa_id   core_id      kpps
      0        30         0
      0        31         0
      0        94         0
      0        95         0
      1       126      1500
      1       127      1000
      1        63      1000
      1        62       500
```

As auto_lb doesn't balance workloads across numa nodes, it makes
more sense to calculate the variance improvement per numa node.

Signed-off-by: Cheng Li <lic121@chinatelecom.cn>
Signed-off-by: Kevin Traynor <ktraynor@redhat.com>
Co-authored-by: Kevin Traynor <ktraynor@redhat.com>
Acked-by: Kevin Traynor <ktraynor@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
This commit is contained in:
Cheng Li
2022-12-17 13:15:36 +00:00
committed by Ilya Maximets
parent ad6e506fcb
commit 46e04ec31b
2 changed files with 46 additions and 47 deletions

View File

@@ -6131,39 +6131,33 @@ rxq_scheduling(struct dp_netdev *dp)
static uint64_t variance(uint64_t a[], int n);
static uint64_t
sched_numa_list_variance(struct sched_numa_list *numa_list)
sched_numa_variance(struct sched_numa *numa)
{
struct sched_numa *numa;
uint64_t *percent_busy = NULL;
unsigned total_pmds = 0;
int n_proc = 0;
uint64_t var;
HMAP_FOR_EACH (numa, node, &numa_list->numas) {
total_pmds += numa->n_pmds;
percent_busy = xrealloc(percent_busy,
total_pmds * sizeof *percent_busy);
percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
for (unsigned i = 0; i < numa->n_pmds; i++) {
struct sched_pmd *sched_pmd;
uint64_t total_cycles = 0;
for (unsigned i = 0; i < numa->n_pmds; i++) {
struct sched_pmd *sched_pmd;
uint64_t total_cycles = 0;
sched_pmd = &numa->pmds[i];
/* Exclude isolated PMDs from variance calculations. */
if (sched_pmd->isolated == true) {
continue;
}
/* Get the total pmd cycles for an interval. */
atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
sched_pmd = &numa->pmds[i];
/* Exclude isolated PMDs from variance calculations. */
if (sched_pmd->isolated == true) {
continue;
}
/* Get the total pmd cycles for an interval. */
atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
if (total_cycles) {
/* Estimate the cycles to cover all intervals. */
total_cycles *= PMD_INTERVAL_MAX;
percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
/ total_cycles;
} else {
percent_busy[n_proc++] = 0;
}
if (total_cycles) {
/* Estimate the cycles to cover all intervals. */
total_cycles *= PMD_INTERVAL_MAX;
percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
/ total_cycles;
} else {
percent_busy[n_proc++] = 0;
}
}
var = variance(percent_busy, n_proc);
@@ -6237,6 +6231,7 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
struct sched_numa_list numa_list_est;
bool thresh_met = false;
uint64_t current_var, estimate_var;
struct sched_numa *numa_cur, *numa_est;
uint64_t improvement = 0;
VLOG_DBG("PMD auto load balance performing dry run.");
@@ -6255,25 +6250,29 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
sched_numa_list_count(&numa_list_est) == 1) {
/* Calculate variances. */
current_var = sched_numa_list_variance(&numa_list_cur);
estimate_var = sched_numa_list_variance(&numa_list_est);
if (estimate_var < current_var) {
improvement = ((current_var - estimate_var) * 100) / current_var;
}
VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64".",
current_var, estimate_var);
VLOG_DBG("Variance improvement %"PRIu64"%%.", improvement);
if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
thresh_met = true;
VLOG_DBG("PMD load variance improvement threshold %u%% "
"is met.", dp->pmd_alb.rebalance_improve_thresh);
} else {
VLOG_DBG("PMD load variance improvement threshold "
"%u%% is not met.",
dp->pmd_alb.rebalance_improve_thresh);
HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
numa_est = sched_numa_list_lookup(&numa_list_est,
numa_cur->numa_id);
if (!numa_est) {
continue;
}
current_var = sched_numa_variance(numa_cur);
estimate_var = sched_numa_variance(numa_est);
if (estimate_var < current_var) {
improvement = ((current_var - estimate_var) * 100)
/ current_var;
}
VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
"variance %"PRIu64". Variance improvement %"PRIu64"%%.",
numa_cur->numa_id, current_var,
estimate_var, improvement);
if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
thresh_met = true;
}
}
VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
dp->pmd_alb.rebalance_improve_thresh,
thresh_met ? "met" : "not met");
} else {
VLOG_DBG("PMD auto load balance detected cross-numa polling with "
"multiple numa nodes. Unable to accurately estimate.");