AT_BANNER([PMD])
m4_divert_push([PREPARE_TESTS])
# Given the output of `ovs-appctl dpif-netdev/pmd-rxq-show`, prints a list
# of every rxq (one per line) in the form:
# port_name rxq_id numa_id core_id
parse_pmd_rxq_show () {
awk '/pmd thread/ {numa=$4; core=substr($6, 1, length($6) - 1)} /^ port:/ {print $2, $4, numa, core}' | sort
}
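# Illustrative example (sample pmd-rxq-show output, port name is arbitrary):
#   pmd thread numa_id 0 core_id 1:
#     isolated : true
#     port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 %
# is printed as:
#   dpdk0 0 0 1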
# Given the output of `ovs-appctl dpif-netdev/pmd-rxq-show`, with the rxq
# lines of each core already joined onto a single line, prints the rxqs of
# each core on one line in the form:
# 'port:' port_name 'queue-id:' rxq_id rxq_id rxq_id rxq_id
parse_pmd_rxq_show_group () {
awk '/port:/ {print $1, $2, $3, $4, $13, $22, $31}'
}
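# For example, once the per-core rxq lines have been joined onto one line
# (the tests below do this with: awk '/AVAIL$/ { printf("%s\t", $0); next } 1'),
# a pmd polling queues 0, 2, 4 and 6 of port p0 is printed as:
#   port: p0 queue-id: 0 2 4 6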
# Given the output of `ovs-appctl dpctl/dump-flows`, prints a list of flows
# (one per line), with the pmd_id at the beginning of the line
#
flow_dump_prepend_pmd () {
awk '/flow-dump from the main/ {pmd_id=-1; next} /flow-dump from pmd/ {pmd_id=$7; next} {print pmd_id, $0}' | sort
}
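# For example, flows listed under a header like "flow-dump from pmd on cpu
# core: 1" are prefixed with "1", while flows from the main thread are
# prefixed with "-1".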
m4_divert_pop([PREPARE_TESTS])
dnl CHECK_CPU_DISCOVERED([n_cpu])
dnl
dnl Waits until CPUs are discovered and checks that the number of discovered
dnl CPUs is greater than or equal to 'n_cpu'.  Without parameters, checks
dnl that at least one CPU was discovered.
m4_define([CHECK_CPU_DISCOVERED], [
PATTERN="Discovered [[0-9]]* NUMA nodes and [[0-9]]* CPU cores"
OVS_WAIT_UNTIL([grep "$PATTERN" ovs-vswitchd.log])
N_CPU=$(grep "$PATTERN" ovs-vswitchd.log | sed -e 's/.* \([[0-9]]*\) CPU cores/\1/')
if [[ -z "$1" ]]
then AT_CHECK([test "$N_CPU" -gt "0"])
else AT_SKIP_IF([test "$N_CPU" -lt "$1"])
fi
])
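dnl Example usage (as in the tests below):
dnl   CHECK_CPU_DISCOVERED()   - check that at least one CPU was discovered.
dnl   CHECK_CPU_DISCOVERED(2)  - skip the test unless at least 2 CPU cores
dnl                              were discovered.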
dnl CHECK_PMD_THREADS_CREATED([n_threads], [numa_id], [+line])
dnl
dnl Waits for the creation of 'n_threads' pmd threads, or at least 1 thread
dnl if $1 is not passed.  Checking starts from line number 'line' in
dnl ovs-vswitchd.log.
m4_define([CHECK_PMD_THREADS_CREATED], [
PATTERN="There are [[0-9]]* pmd threads on numa node $2"
line_st=$3
if [[ -z "$line_st" ]]
then
line_st="+0"
fi
OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$PATTERN"])
N_THREADS=$(tail -n $line_st ovs-vswitchd.log | grep "$PATTERN" | tail -1 | sed -e 's/.* \([[0-9]]*\) pmd .*/\1/')
if [[ -z "$1" ]]
then AT_CHECK([test "$N_THREADS" -gt 0])
else AT_CHECK([test "$N_THREADS" -eq "$1"])
fi
])
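dnl Example usage (as in the tests below):
dnl   CHECK_PMD_THREADS_CREATED()                 - at least one pmd thread.
dnl   CHECK_PMD_THREADS_CREATED([2], [], [+$TMP]) - exactly two pmd threads,
dnl       scanning ovs-vswitchd.log starting from line $TMP.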
dnl CHECK_DP_SLEEP_MAX([max_sleep], [+line])
dnl
dnl Checks the PMD load-based max sleep value reported for the datapath.
dnl Checking starts from line number 'line' in ovs-vswitchd.log.
m4_define([CHECK_DP_SLEEP_MAX], [
SLEEP_TIME="Default PMD thread max sleep: *[$1] us."
line_st=$2
if [[ -z "$line_st" ]]
then
line_st="+0"
fi
OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_TIME"])
])
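dnl Illustrative usage (parameter values are hypothetical):
dnl   CHECK_DP_SLEEP_MAX([50], [+$TMP])
dnl waits for a "Default PMD thread max sleep: 50 us." log line, scanning
dnl ovs-vswitchd.log from line $TMP.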
dnl CHECK_PMD_SLEEP_MAX([numa_id], [core_id], [max_sleep], [+line])
dnl
dnl Checks the max sleep time of the pmd on 'numa_id'/'core_id'.
dnl Checking starts from line number 'line' in ovs-vswitchd.log.
m4_define([CHECK_PMD_SLEEP_MAX], [
PATTERN="PMD thread on numa_id: *[$1], core id: *[$2], max sleep: *[$3] us."
line_st=$4
if [[ -z "$line_st" ]]
then
line_st="+0"
fi
OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$PATTERN"])
])
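dnl Illustrative usage (parameter values are hypothetical):
dnl   CHECK_PMD_SLEEP_MAX([0], [1], [200], [+$TMP])
dnl waits for a "PMD thread on numa_id: 0, core id: 1, max sleep: 200 us."
dnl log line, scanning ovs-vswitchd.log from line $TMP.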
m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/"])
m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"])
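dnl SED_NUMA_CORE_PATTERN (above) masks the numa_id and core_id values in
dnl pmd-rxq-show output so that expected output does not depend on which
dnl cores the pmd threads land on.  DUMMY_NUMA simulates four CPU cores,
dnl all on NUMA node 0.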
AT_SETUP([PMD - creating a thread/add-port])
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd], [], [], [DUMMY_NUMA])
CHECK_CPU_DISCOVERED()
CHECK_PMD_THREADS_CREATED()
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id <cleared> core_id <cleared>:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
])
AT_CHECK([ovs-appctl dpif/show], [0], [dnl
dummy@ovs-dummy: hit:0 missed:0
br0:
br0 65534/100: (dummy-internal)
p0 1/1: (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0)
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - multiqueue support])
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd], [], [], [DUMMY_NUMA])
CHECK_CPU_DISCOVERED()
CHECK_PMD_THREADS_CREATED()
AT_CHECK([ovs-vsctl set interface p0 options:n_rxq=8])
AT_CHECK([ovs-appctl dpif/show], [0], [dnl
dummy@ovs-dummy: hit:0 missed:0
br0:
br0 65534/100: (dummy-internal)
p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=0)
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id <cleared> core_id <cleared>:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - pmd-cpu-mask/distribution of rx queues])
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8],
[], [], [DUMMY_NUMA])
CHECK_CPU_DISCOVERED(2)
CHECK_PMD_THREADS_CREATED()
AT_CHECK([ovs-appctl dpif/show], [0], [dnl
dummy@ovs-dummy: hit:0 missed:0
br0:
br0 65534/100: (dummy-internal)
p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=0)
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id <cleared> core_id <cleared>:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
])
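# The TMP=... lines below record the next line number of ovs-vswitchd.log so
# that the subsequent log checks only consider lines added after this point.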
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using roundrobin algorithm"])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3])
CHECK_PMD_THREADS_CREATED([2], [], [+$TMP])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
port: p0 queue-id: 0 2 4 6
port: p0 queue-id: 1 3 5 7
])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
port: p0 queue-id: 0 3 4 7
port: p0 queue-id: 1 2 5 6
])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=group])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
port: p0 queue-id: 0 2 4 6
port: p0 queue-id: 1 3 5 7
])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1])
CHECK_PMD_THREADS_CREATED([1], [], [+$TMP])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id <cleared> core_id <cleared>:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - pmd-cpu-mask - dual NUMA])
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8 options:numa_id=1 -- set Open_vSwitch . other_config:pmd-cpu-mask=1],
[], [], [--dummy-numa 1,1,0,0])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
CHECK_CPU_DISCOVERED(4)
CHECK_PMD_THREADS_CREATED()
AT_CHECK([ovs-appctl dpif/show], [0], [dnl
dummy@ovs-dummy: hit:0 missed:0
br0:
br0 65534/100: (dummy-internal)
p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=1)
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id <cleared> core_id <cleared>:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
])
# Force cross-numa polling
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0xc])
CHECK_PMD_THREADS_CREATED([2], [0], [+$TMP])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "There's no available (non-isolated) pmd thread on numa node 1. Port 'p0' rx queue 7 will be assigned to a pmd on numa node 0. This may lead to reduced performance."])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
port: p0 queue-id: 0 3 4 7
port: p0 queue-id: 1 2 5 6
])
# Check other assignment types for cross-numa polling
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using roundrobin algorithm"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "There's no available (non-isolated) pmd thread on numa node 1. Port 'p0' rx queue 7 will be assigned to a pmd on numa node 0. This may lead to reduced performance."])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
port: p0 queue-id: 0 2 4 6
port: p0 queue-id: 1 3 5 7
])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=group])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "There's no available (non-isolated) pmd thread on numa node 1. Port 'p0' rx queue 7 will be assigned to a pmd on numa node 0. This may lead to reduced performance."])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
port: p0 queue-id: 0 2 4 6
port: p0 queue-id: 1 3 5 7
])
# Switch back to same numa
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3])
CHECK_PMD_THREADS_CREATED([2], [1], [+$TMP])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
port: p0 queue-id: 0 2 4 6
port: p0 queue-id: 1 3 5 7
])
# Check local numa is only used if available
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x6])
CHECK_PMD_THREADS_CREATED([1], [0], [+$TMP])
CHECK_PMD_THREADS_CREATED([1], [1], [+$TMP])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id 1 core_id 1:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
pmd thread numa_id 0 core_id 2:
isolated : false
])
# Check other assignment types
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using roundrobin algorithm"])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id 1 core_id 1:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
pmd thread numa_id 0 core_id 2:
isolated : false
])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id 1 core_id 1:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
pmd thread numa_id 0 core_id 2:
isolated : false
])
# Switch back from mixed numa to single numa
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1])
CHECK_PMD_THREADS_CREATED([1], [1], [+$TMP])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
Displaying last 60 seconds pmd usage %
pmd thread numa_id 1 core_id 0:
isolated : false
port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 1 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 2 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 3 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 4 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 5 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 6 (enabled) pmd usage: NOT AVAIL
port: p0 queue-id: 7 (enabled) pmd usage: NOT AVAIL
overhead: NOT AVAIL
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - pmd-cpu-mask - multi NUMA])
OVS_VSWITCHD_START([add-port br0 p0 \
-- set Interface p0 type=dummy-pmd options:n_rxq=4 \
-- set Interface p0 options:numa_id=0 \
-- set Open_vSwitch . other_config:pmd-cpu-mask=0xf \
-- set open_vswitch . other_config:pmd-rxq-assign=cycles],
[], [], [--dummy-numa 1,2,1,2])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=group])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "There's no available (non-isolated) pmd thread on numa node 0."])
# check all pmds from both non-local numas are assigned an rxq
AT_CHECK([test `ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | wc -l` -eq 4])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "There's no available (non-isolated) pmd thread on numa node 0."])
# check all pmds from both non-local numas are assigned an rxq
AT_CHECK([test `ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | wc -l` -eq 4])
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using roundrobin algorithm"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "There's no available (non-isolated) pmd thread on numa node 0."])
# check all pmds from both non-local numas are assigned an rxq
AT_CHECK([test `ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | wc -l` -eq 4])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - stats])
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 ofport_request=7 type=dummy-pmd options:n_rxq=4],
[], [], [DUMMY_NUMA])
CHECK_CPU_DISCOVERED()
CHECK_PMD_THREADS_CREATED()
AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=normal])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:emc-insert-inv-prob=1])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:smc-enable=true])
sleep 1
AT_CHECK([ovs-appctl dpif/show], [0], [dnl
dummy@ovs-dummy: hit:0 missed:0
br0:
br0 65534/100: (dummy-internal)
p0 7/1: (dummy-pmd: n_rxq=4, n_txq=1, numa_id=0)
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN | sed '/cycles/d' | grep pmd -A 12], [0], [dnl
pmd thread numa_id <cleared> core_id <cleared>:
packets received: 0
packet recirculations: 0
avg. datapath passes per packet: 0.00
phwol hits: 0
mfex opt hits: 0
simple match hits: 0
emc hits: 0
smc hits: 0
megaflow hits: 0
avg. subtable lookups per megaflow hit: 0.00
miss with success upcall: 0
miss with failed upcall: 0
])
ovs-appctl time/stop
ovs-appctl time/warp 100
(
for i in `seq 0 19`;
do
pkt="in_port(7),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)"
AT_CHECK([ovs-appctl netdev-dummy/receive p0 $pkt])
done
)
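dnl Only the first of the 20 packets injected above should miss and take the
dnl upcall path; the remaining 19 should hit the exact match cache, as the
dnl per-pmd stats checked below confirm.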
ovs-appctl time/warp 100
AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl
recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)
])
AT_CHECK([cat ovs-vswitchd.log | filter_flow_install | strip_xout], [0], [dnl
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(frag=no), actions: <del>
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN | sed '/cycles/d' | grep pmd -A 12], [0], [dnl
pmd thread numa_id <cleared> core_id <cleared>:
packets received: 20
packet recirculations: 0
avg. datapath passes per packet: 1.00
phwol hits: 0
mfex opt hits: 0
simple match hits: 0
emc hits: 19
smc hits: 0
megaflow hits: 0
avg. subtable lookups per megaflow hit: 0.00
miss with success upcall: 1
miss with failed upcall: 0
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - pmd-rxq-show pmd usage time])
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd], [], [], [DUMMY_NUMA])
CHECK_CPU_DISCOVERED()
CHECK_PMD_THREADS_CREATED()
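dnl "pmd-rxq-show -secs N" limits the displayed pmd usage to the last N
dnl seconds. Judging from the checks below, the value is rounded up to the
dnl next 5 second interval and capped at 60 seconds; omitted, zero or
dnl negative values fall back to the full 60 second window.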
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | grep Displaying], [0], [dnl
Displaying last 60 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs -1 | grep Displaying], [0], [dnl
Displaying last 60 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 0 | grep Displaying], [0], [dnl
Displaying last 60 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 1 | grep Displaying], [0], [dnl
Displaying last 5 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 5 | grep Displaying], [0], [dnl
Displaying last 5 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 6 | grep Displaying], [0], [dnl
Displaying last 10 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 51 | grep Displaying], [0], [dnl
Displaying last 55 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 55 | grep Displaying], [0], [dnl
Displaying last 55 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 56 | grep Displaying], [0], [dnl
Displaying last 60 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 60 | grep Displaying], [0], [dnl
Displaying last 60 seconds pmd usage %
])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 61 | grep Displaying], [0], [dnl
Displaying last 60 seconds pmd usage %
])
OVS_VSWITCHD_STOP
AT_CLEANUP
dnl Reconfigure the number of rx queues of a port, make sure that all the
dnl queues are polled by the datapath and try to send a couple of packets.
AT_SETUP([PMD - reconfigure n_rxq])
OVS_VSWITCHD_START(
[add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2
], [], [], [--dummy-numa 0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=controller])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
p1 1 0 0
p2 0 0 0
])
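dnl "netdev-dummy/receive --qid N" injects the packet into rx queue N of the
dnl dummy port, so every configured queue can be exercised individually.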
AT_CAPTURE_FILE([ofctl_monitor.log])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
AT_CHECK([ovs-vsctl set interface p1 options:n_rxq=4])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
p1 1 0 0
p1 2 0 0
p1 3 0 0
p2 0 0 0
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 3 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
dnl Check resetting to default number of rx queues after removal from the db.
AT_CHECK([ovs-vsctl remove interface p1 options n_rxq])
AT_CHECK([ovs-appctl dpif/show | grep p1], [0], [dnl
p1 1/1: (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0)
])
OVS_VSWITCHD_STOP
AT_CLEANUP
dnl There was a bug where OVS failed to create a ukey and install a megaflow
dnl if a packet with the exact same flow was received by two different pmd
dnl threads. This is a regression test for that bug.
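dnl Each pmd thread keeps its own flow table, so the same megaflow is expected
dnl to be installed once per pmd; the dpctl/dump-flows check below verifies
dnl exactly that.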
AT_SETUP([PMD - same flow multiple threads])
OVS_VSWITCHD_START(
[add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
set Open_vSwitch . other_config:pmd-cpu-mask=3
], [], [], [--dummy-numa 0,0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=controller])
dnl Make sure that the queues are on different cores. There's no way to
dnl control which queue is on which thread; we just need to make sure that
dnl two threads (core_id) show up in pmd-rxq-show.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 4 -d ' ' | sort], [0], [dnl
0
1
])
AT_CAPTURE_FILE([ofctl_monitor.log])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 4])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
dnl Make sure that both flows have been installed
AT_CHECK([ovs-appctl dpctl/dump-flows | flow_dump_prepend_pmd], [0], [dnl
0 recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535))
1 recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535))
])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - change numa node])
OVS_VSWITCHD_START(
[add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
set Open_vSwitch . other_config:pmd-cpu-mask=7
], [], [], [--dummy-numa 0,1,8])
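dnl With --dummy-numa 0,1,8 core 0 sits on NUMA node 0, core 1 on node 1 and
dnl core 2 on node 8. Changing a port's numa_id below should move its rxqs to
dnl a pmd thread on the matching node.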
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=controller])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
p1 1 0 0
p2 0 0 0
p2 1 0 0
])
AT_CAPTURE_FILE([ofctl_monitor.log])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
AT_CHECK([ovs-vsctl set Interface p2 options:numa_id=1])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
p1 1 0 0
p2 0 1 1
p2 1 1 1
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 8 2
p1 1 8 2
p2 0 1 1
p2 1 1 1
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - non pmd device])
OVS_VSWITCHD_START(
[add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=1 -- \
add-port br0 p2 -- set Interface p2 type=dummy ofport_request=2 -- \
set Interface br0 options:tx_pcap=br0.pcap -- \
set Open_vSwitch . other_config:pmd-cpu-mask=1
], [], [], [--dummy-numa 0,0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=LOCAL])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
])
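dnl p2 is a plain dummy (non-pmd) port, so it is polled by the main thread and
dnl does not appear in pmd-rxq-show; packets received on it must still be
dnl forwarded to br0 just like those from the pmd port p1.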
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `ovs-pcap br0.pcap | wc -l` -ge 2])
AT_CHECK([ovs-pcap br0.pcap], [0], [dnl
50540000000a50540000000908004500005c000000004001669f0a0000020a000001080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
50540000000a50540000000908004500005c000000004001669f0a0000020a000001080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=2])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 1
])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `ovs-pcap br0.pcap | wc -l` -ge 4])
AT_CHECK([ovs-pcap br0.pcap], [0], [dnl
50540000000a50540000000908004500005c000000004001669f0a0000020a000001080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
50540000000a50540000000908004500005c000000004001669f0a0000020a000001080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
50540000000a50540000000908004500005c000000004001669f0a0000020a000001080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
50540000000a50540000000908004500005c000000004001669f0a0000020a000001080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - add remove ports])
OVS_VSWITCHD_START(
[], [], [], [--dummy-numa 0,0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=1])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
])
AT_CAPTURE_FILE([ofctl_monitor.log])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
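dnl Delete and re-add the port: the re-added port has to be picked up by a pmd
dnl again and its traffic should still reach the controller.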
AT_CHECK([ovs-vsctl del-port br0 p1])
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=1])
AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
AT_CHECK([cat ofctl_monitor.log], [0], [dnl
NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - rxq affinity])
OVS_VSWITCHD_START(
[], [], [], [--dummy-numa 0,0,0,0,0,0,0,0,0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1fe])
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=4 other_config:pmd-rxq-affinity="0:3,1:7,2:2,3:8"])
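dnl pmd-rxq-affinity is a comma-separated list of <rxq-id>:<core-id> pairs;
dnl "0:3,1:7,2:2,3:8" above pins rxq 0 to core 3, rxq 1 to core 7, and so on.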
dnl The rxqs should be on the requested cores.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 3
p1 1 0 7
p1 2 0 2
p1 3 0 8
])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=6])
dnl We removed the cores requested by some queues from pmd-cpu-mask.
dnl Those queues will be polled by the remaining non-isolated pmds.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 1
p1 1 0 1
p1 2 0 2
p1 3 0 1
])
# Check they are pinned when those pmds are available again
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1fe])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 3
p1 1 0 7
p1 2 0 2
p1 3 0 8
])
AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity])
dnl We removed the rxq-affinity request. dpif-netdev should assign queues
dnl in a round robin fashion. We just make sure that every rxq is being
dnl polled again.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 1,2 -d ' ' | sort], [0], [dnl
p1 0
p1 1
p1 2
p1 3
])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=6])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity='0:1'])
dnl We explicitly requested core 1 for queue 0. Core 1 becomes isolated and
dnl every other queue goes to core 2.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 1
p1 1 0 2
p1 2 0 2
p1 3 0 2
])
OVS_VSWITCHD_STOP(["/cannot be pinned with port/d"])
AT_CLEANUP
AT_SETUP([PMD - rxq affinity - non-isolate])
OVS_VSWITCHD_START(
[], [], [], [--dummy-numa 0,0,0,0,0,0,0,0,0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1fe])
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=4 other_config:pmd-rxq-affinity="0:3,1:7,2:2,3:8"])
dnl The rxqs should be on the requested cores.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 3
p1 1 0 7
p1 2 0 2
p1 3 0 8
])
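dnl Only the "group" assignment algorithm supports pmd-rxq-isolate=false,
dnl i.e. keeping a core with pinned rxqs available for other, unpinned rxqs;
dnl the log checks further down rely on that.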
# change rxq assignment algorithm
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=group])
dnl The rxqs should be on the requested cores.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 3
p1 1 0 7
p1 2 0 2
p1 3 0 8
])
# try to pin & non-isolate
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-isolate=false])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "pmd-rxq-affinity does not isolate PMD core"])
# should have no impact - all rxqs are still pinned
dnl The rxqs should be on the requested cores.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 3
p1 1 0 7
p1 2 0 2
p1 3 0 8
])
# remove some pinning - check that non-isolated pmds are used for rxq assignment of the other rxqs
AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity='0:1'])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=6])
dnl We explicitly requested core 1 for queue 0. Core 1 is not isolated, so it
dnl is also used for other rxqs.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 1
p1 1 0 2
p1 2 0 1
p1 3 0 2
])
# change to algorithm that does not support pin & non-isolate
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "pmd-rxq-isolate can only be set false when using pmd-rxq-assign=group"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "pmd-rxq-affinity isolates PMD core"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"])
dnl We explicitly requested core 1 for queue 0. Core 1 becomes isolated and
dnl every other queue goes to core 2.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 1
p1 1 0 2
p1 2 0 2
p1 3 0 2
])
# change rxq assignment algorithm to one that supports pin & non-isolate
TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=group])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "pmd-rxq-affinity does not isolate PMD core"])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"])
dnl We explicitly requested core 1 for queue 0. With pmd-rxq-isolate=false and
dnl the group algorithm, core 1 is not isolated, so the remaining queues are
dnl spread across cores 1 and 2.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 1
p1 1 0 2
p1 2 0 1
p1 3 0 2
])
OVS_VSWITCHD_STOP(["/cannot be pinned with port/d
/pmd-rxq-isolate can only be set false when using pmd-rxq-assign=group/d"])
AT_CLEANUP
AT_SETUP([PMD - rxq affinity - NUMA])
OVS_VSWITCHD_START(
[], [], [], [--dummy-numa 0,0,0,1,1,8,8])
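dnl Core to NUMA node mapping in this test: cores 0-2 on node 0, cores 3-4 on
dnl node 1, cores 5-6 on node 8.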
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=7e])
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 options:numa_id=0 other_config:pmd-rxq-affinity="0:1,1:2"])
dnl The rxqs should be on the requested cores.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 1
p1 1 0 2
])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
dnl We moved the queues to a different, contiguous NUMA node. Expecting threads
dnl on NUMA node 1 to be created.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3
p1 1 1 4
])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"])
dnl We moved the queues to a different, non-contiguous NUMA node. Expecting
dnl threads on NUMA node 8 to be created.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 8 5
p1 1 8 6
])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
dnl Queues split between contiguous NUMA nodes.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3
p1 1 0 1
])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:1"])
dnl Queues split between non-contiguous NUMA nodes.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 8 5
p1 1 0 1
])
AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity])
dnl We removed the rxq-affinity request. dpif-netdev should assign queues
dnl in a round robin fashion. We just make sure that every rxq is being
dnl polled again.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 1,2 -d ' ' | sort], [0], [dnl
p1 0
p1 1
])
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity='0:3'])
dnl We explicitly request NUMA node 1 for queue 0.
dnl Queue 1 should be polled by a thread from NUMA node 0.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 1,2,3 -d ' '], [0], [dnl
p1 0 1
p1 1 0
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - monitor threads])
OVS_VSWITCHD_START(
[], [], [], [--dummy-numa 0,0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
dnl The two devices are connected together externally using net.sock
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=1 options:pstream=punix:$OVS_RUNDIR/net.sock])
AT_CHECK([ovs-vsctl add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=1 options:stream=unix:$OVS_RUNDIR/net.sock])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
p2 0 0 0
])
dnl Enable bfd with a very aggressive interval. This will make the monitor very
dnl busy, and uncover race conditions with the main thread.
AT_CHECK([ovs-vsctl set Interface p1 bfd:enable=true bfd:min_rx=1 bfd:min_tx=1])
AT_CHECK([ovs-vsctl set Interface p2 bfd:enable=true bfd:min_rx=1 bfd:min_tx=1])
AT_CHECK([ovs-vsctl wait-until Interface p1 bfd_status:forwarding=true \
-- wait-until Interface p2 bfd_status:forwarding=true])
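dnl Reconfiguring the datapath while the bfd monitor (a non-pmd thread) keeps
dnl executing packets used to crash in dpif_netdev_execute(); make sure the
dnl reconfiguration below survives it.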
dnl Trigger reconfiguration of the datapath
AT_CHECK([ovs-vsctl set Interface p1 options:n_rxq=2])
AT_CHECK([ovs-vsctl set Interface p2 options:n_rxq=2])
dnl Make sure that reconfiguration succeeded
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
p1 1 0 0
p2 0 0 0
p2 1 0 0
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - dpctl])
OVS_VSWITCHD_START(
[del-br br0], [], [], [--dummy-numa 0,0])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-appctl dpctl/add-dp dummy@dp0])
AT_CHECK([ovs-appctl dpctl/add-if dummy@dp0 p1,type=dummy-pmd])
AT_CHECK([ovs-appctl dpctl/add-if dummy@dp0 p2,type=dummy])
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show dp0 | parse_pmd_rxq_show], [0], [dnl
p1 0 0 0
])
AT_CHECK([ovs-appctl dpctl/show dummy@dp0], [0], [dnl
dummy@dp0:
lookups: hit:0 missed:0 lost:0
flows: 0
port 0: dp0 (dummy-internal)
port 1: p1 (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0)
port 2: p2 (dummy)
])
AT_CHECK([ovs-appctl dpctl/add-flow dummy@dp0 'in_port(1),eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:02),eth_type(0x1234)' 2], [0], [dnl
])
AT_CHECK([ovs-appctl dpctl/dump-flows dummy@dp0 | sort], [0], [dnl
flow-dump from pmd on cpu core: 0
flow-dump from the main thread:
recirc_id(0),in_port(1),eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:02),eth_type(0x1234), packets:0, bytes:0, used:never, actions:2
recirc_id(0),in_port(1),eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:02),eth_type(0x1234), packets:0, bytes:0, used:never, actions:2
])
dnl Check the pmd filtering option of dpctl/dump-flows: pmd=0 selects the
dnl PMD thread on core 0, pmd=-1 the non-PMD (main) thread.
AT_CHECK([ovs-appctl dpctl/dump-flows dummy@dp0 pmd=0], [0], [dnl
recirc_id(0),in_port(1),eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:02),eth_type(0x1234), packets:0, bytes:0, used:never, actions:2
])
AT_CHECK([ovs-appctl dpctl/dump-flows dummy@dp0 pmd=-1], [0], [dnl
recirc_id(0),in_port(1),eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:02),eth_type(0x1234), packets:0, bytes:0, used:never, actions:2
])
AT_CHECK([ovs-appctl dpctl/del-flow dummy@dp0 'in_port(1),eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:02),eth_type(0x1234)'], [0], [dnl
])
AT_CHECK([ovs-appctl dpctl/dump-flows dummy@dp0], [0], [dnl
])
AT_CHECK([ovs-appctl dpctl/del-dp dummy@dp0], [0], [dnl
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - dpif configuration])
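dnl Check that the DPIF implementation can be switched at runtime with
dnl dpif-netdev/dpif-impl-set and read back with dpif-netdev/dpif-impl-get.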
OVS_VSWITCHD_START([], [], [], [--dummy-numa 0,0])
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd])
AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_scalar], [0], [dnl
DPIF implementation set to dpif_scalar.
])
AT_CHECK([ovs-vsctl show], [], [stdout])
AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-get | grep "dpif_scalar"], [], [dnl
dpif_scalar (pmds: 0)
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - dpcls configuration])
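dnl Check setting dpcls subtable lookup priorities with
dnl dpif-netdev/subtable-lookup-prio-set, reading them back with
dnl dpif-netdev/subtable-lookup-info-get and rejecting out-of-range values.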
OVS_VSWITCHD_START([], [], [], [--dummy-numa 0,0])
AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd])
AT_CHECK([ovs-vsctl show], [], [stdout])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set autovalidator 3], [0], [dnl
Lookup priority change affected 0 dpcls ports and 0 subtables.
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep autovalidator], [], [dnl
autovalidator (Use count: 0, Priority: 3)
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set generic 4], [0], [dnl
Lookup priority change affected 0 dpcls ports and 0 subtables.
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep generic], [], [dnl
generic (Use count: 0, Priority: 4)
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set generic 8], [0], [dnl
Lookup priority change affected 0 dpcls ports and 0 subtables.
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep generic], [], [dnl
generic (Use count: 0, Priority: 8)
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set autovalidator 8], [0], [dnl
Lookup priority change affected 0 dpcls ports and 0 subtables.
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep autovalidator], [], [dnl
autovalidator (Use count: 0, Priority: 8)
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set generic 0], [0], [dnl
Lookup priority change affected 0 dpcls ports and 0 subtables.
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep generic], [], [dnl
generic (Use count: 0, Priority: 0)
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set generic 255], [0], [dnl
Lookup priority change affected 0 dpcls ports and 0 subtables.
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep generic], [], [dnl
generic (Use count: 0, Priority: 255)
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set generic -1], [2],
[], [dnl
error converting priority, use integer in range 0-255
ovs-appctl: ovs-vswitchd: server returned an error
])
AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set generic 300], [2],
[], [dnl
error converting priority, use integer in range 0-255
ovs-appctl: ovs-vswitchd: server returned an error
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - pmd sleep])
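dnl Check the global pmd-sleep-max setting: the default of 0, low and high
dnl values, capping at 10000 us and resetting back to 0.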
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8 options:numa_id=1], [], [], [--dummy-numa 0,0,0,1,1,8,8])
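dnl --dummy-numa 0,0,0,1,1,8,8 puts cores 0-2 on numa 0, cores 3-4 on numa 1
dnl and cores 5-6 on numa 8; PMD threads initially run on cores 0, 3 and 5.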
dnl Check default
CHECK_DP_SLEEP_MAX([0], [])
CHECK_PMD_SLEEP_MAX([0], [0], [0], [])
CHECK_PMD_SLEEP_MAX([1], [3], [0], [])
CHECK_PMD_SLEEP_MAX([8], [5], [0], [])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 0 us
pmd thread numa_id 0 core_id 0:
max sleep: 0 us
pmd thread numa_id 1 core_id 3:
max sleep: 0 us
pmd thread numa_id 8 core_id 5:
max sleep: 0 us
])
dnl Check low value max sleep
get_log_next_line_num
AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="1"])
CHECK_DP_SLEEP_MAX([1], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [1], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [1], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [1], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 1 us
pmd thread numa_id 0 core_id 0:
max sleep: 1 us
pmd thread numa_id 1 core_id 3:
max sleep: 1 us
pmd thread numa_id 8 core_id 5:
max sleep: 1 us
])
dnl Check high value max sleep
get_log_next_line_num
AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10000"])
CHECK_DP_SLEEP_MAX([10000], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [10000], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [10000], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [10000], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 10000 us
pmd thread numa_id 0 core_id 0:
max sleep: 10000 us
pmd thread numa_id 1 core_id 3:
max sleep: 10000 us
pmd thread numa_id 8 core_id 5:
max sleep: 10000 us
])
dnl Check setting max sleep to zero
get_log_next_line_num
AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="0"])
CHECK_DP_SLEEP_MAX([0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 0 us
pmd thread numa_id 0 core_id 0:
max sleep: 0 us
pmd thread numa_id 1 core_id 3:
max sleep: 0 us
pmd thread numa_id 8 core_id 5:
max sleep: 0 us
])
dnl Check that a value above the maximum is capped at 10000 us.
get_log_next_line_num
AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10001"])
CHECK_DP_SLEEP_MAX([10000], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [10000], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [10000], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [10000], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 10000 us
pmd thread numa_id 0 core_id 0:
max sleep: 10000 us
pmd thread numa_id 1 core_id 3:
max sleep: 10000 us
pmd thread numa_id 8 core_id 5:
max sleep: 10000 us
])
dnl Check that a non-round value (490) is stored as given, without rounding.
get_log_next_line_num
AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="490"])
CHECK_DP_SLEEP_MAX([490], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [490], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [490], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [490], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 490 us
pmd thread numa_id 0 core_id 0:
max sleep: 490 us
pmd thread numa_id 1 core_id 3:
max sleep: 490 us
pmd thread numa_id 8 core_id 5:
max sleep: 490 us
])
dnl Check that a non-round value (499) is stored as given, without rounding.
get_log_next_line_num
AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="499"])
CHECK_DP_SLEEP_MAX([499], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [499], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [499], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [499], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 499 us
pmd thread numa_id 0 core_id 0:
max sleep: 499 us
pmd thread numa_id 1 core_id 3:
max sleep: 499 us
pmd thread numa_id 8 core_id 5:
max sleep: 499 us
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - per PMD sleep])
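dnl Check per-PMD overrides of pmd-sleep-max. The value is a comma separated
dnl list where a plain number sets the datapath default and 'core:us' entries
dnl set the maximum sleep for the PMD on that core.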
OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8 options:numa_id=1],
[], [], [--dummy-numa 0,0,0,1,1,8,8])
dnl Check system default.
CHECK_DP_SLEEP_MAX([0], [])
CHECK_PMD_SLEEP_MAX([0], [0], [0], [])
CHECK_PMD_SLEEP_MAX([1], [3], [0], [])
CHECK_PMD_SLEEP_MAX([8], [5], [0], [])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 0 us
pmd thread numa_id 0 core_id 0:
max sleep: 0 us
pmd thread numa_id 1 core_id 3:
max sleep: 0 us
pmd thread numa_id 8 core_id 5:
max sleep: 0 us
])
dnl Only per PMD.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=3:300,0:100,5:400])
CHECK_DP_SLEEP_MAX([0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [400], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 0 us
pmd thread numa_id 0 core_id 0:
max sleep: 100 us
pmd thread numa_id 1 core_id 3:
max sleep: 300 us
pmd thread numa_id 8 core_id 5:
max sleep: 400 us
])
dnl Mix of a default value (not used by any current PMD) and per-PMD values.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=50,3:300,0:100,5:200])
CHECK_DP_SLEEP_MAX([50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 50 us
pmd thread numa_id 0 core_id 0:
max sleep: 100 us
pmd thread numa_id 1 core_id 3:
max sleep: 300 us
pmd thread numa_id 8 core_id 5:
max sleep: 200 us
])
dnl Remove a per-PMD entry; that core falls back to the default.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=50,3:300])
CHECK_DP_SLEEP_MAX([50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [50], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 50 us
pmd thread numa_id 0 core_id 0:
max sleep: 50 us
pmd thread numa_id 1 core_id 3:
max sleep: 300 us
pmd thread numa_id 8 core_id 5:
max sleep: 50 us
])
dnl Change values, with the per-PMD entry listed before the default.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=3:400,200])
CHECK_DP_SLEEP_MAX([200], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [200], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [400], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 200 us
pmd thread numa_id 0 core_id 0:
max sleep: 200 us
pmd thread numa_id 1 core_id 3:
max sleep: 400 us
pmd thread numa_id 8 core_id 5:
max sleep: 200 us
])
dnl Add values for PMD cores (2 and 6) that are not in the pmd-cpu-mask yet.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=2:600,50,3:300,0:100,6:400,5:200])
CHECK_DP_SLEEP_MAX([50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 50 us
pmd thread numa_id 0 core_id 0:
max sleep: 100 us
pmd thread numa_id 1 core_id 3:
max sleep: 300 us
pmd thread numa_id 8 core_id 5:
max sleep: 200 us
])
dnl Add more cores with pmd-cpu-mask=7f; new PMDs pick up their per-PMD or
dnl default values.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=7f])
CHECK_PMD_SLEEP_MAX([0], [1], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [2], [600], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [4], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [6], [400], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 50 us
pmd thread numa_id 0 core_id 0:
max sleep: 100 us
pmd thread numa_id 0 core_id 1:
max sleep: 50 us
pmd thread numa_id 0 core_id 2:
max sleep: 600 us
pmd thread numa_id 1 core_id 3:
max sleep: 300 us
pmd thread numa_id 1 core_id 4:
max sleep: 50 us
pmd thread numa_id 8 core_id 5:
max sleep: 200 us
pmd thread numa_id 8 core_id 6:
max sleep: 400 us
])
dnl Go back to just a global value.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=90])
CHECK_DP_SLEEP_MAX([90], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [90], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [1], [90], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [2], [90], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [90], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [4], [90], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [90], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [6], [90], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 90 us
pmd thread numa_id 0 core_id 0:
max sleep: 90 us
pmd thread numa_id 0 core_id 1:
max sleep: 90 us
pmd thread numa_id 0 core_id 2:
max sleep: 90 us
pmd thread numa_id 1 core_id 3:
max sleep: 90 us
pmd thread numa_id 1 core_id 4:
max sleep: 90 us
pmd thread numa_id 8 core_id 5:
max sleep: 90 us
pmd thread numa_id 8 core_id 6:
max sleep: 90 us
])
dnl Try invalid value.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=qwe])
CHECK_DP_SLEEP_MAX([0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [1], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [4], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [6], [0], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 0 us
pmd thread numa_id 0 core_id 0:
max sleep: 0 us
pmd thread numa_id 0 core_id 1:
max sleep: 0 us
pmd thread numa_id 0 core_id 2:
max sleep: 0 us
pmd thread numa_id 1 core_id 3:
max sleep: 0 us
pmd thread numa_id 1 core_id 4:
max sleep: 0 us
pmd thread numa_id 8 core_id 5:
max sleep: 0 us
pmd thread numa_id 8 core_id 6:
max sleep: 0 us
])
dnl Try invalid key:value.
get_log_next_line_num
AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=50,1:qwe,2:0])
CHECK_DP_SLEEP_MAX([50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [1], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [4], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [50], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [6], [50], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 50 us
pmd thread numa_id 0 core_id 0:
max sleep: 50 us
pmd thread numa_id 0 core_id 1:
max sleep: 50 us
pmd thread numa_id 0 core_id 2:
max sleep: 0 us
pmd thread numa_id 1 core_id 3:
max sleep: 50 us
pmd thread numa_id 1 core_id 4:
max sleep: 50 us
pmd thread numa_id 8 core_id 5:
max sleep: 50 us
pmd thread numa_id 8 core_id 6:
max sleep: 50 us
])
dnl Remove config.
get_log_next_line_num
AT_CHECK([ovs-vsctl remove Open_vSwitch . other_config pmd-sleep-max])
CHECK_DP_SLEEP_MAX([0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [1], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([1], [4], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM])
CHECK_PMD_SLEEP_MAX([8], [6], [0], [+$LINENUM])
AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl
Default max sleep: 0 us
pmd thread numa_id 0 core_id 0:
max sleep: 0 us
pmd thread numa_id 0 core_id 1:
max sleep: 0 us
pmd thread numa_id 0 core_id 2:
max sleep: 0 us
pmd thread numa_id 1 core_id 3:
max sleep: 0 us
pmd thread numa_id 1 core_id 4:
max sleep: 0 us
pmd thread numa_id 8 core_id 5:
max sleep: 0 us
pmd thread numa_id 8 core_id 6:
max sleep: 0 us
])
OVS_VSWITCHD_STOP
AT_CLEANUP
AT_SETUP([PMD - revalidator modify overlapping flows])
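dnl Overlapping megaflows (here 10.1.2.0/24 and 10.1.0.0/16) are allowed as
dnl long as their actions are equal. Check that modifying a flow during
dnl revalidation updates the intended megaflow rather than whichever one a
dnl plain key lookup happens to hit first.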
OVS_VSWITCHD_START(
[add-port br0 p1 \
-- set bridge br0 datapath-type=dummy \
-- set interface p1 type=dummy-pmd \
-- add-port br0 p2 \
-- set interface p2 type=dummy-pmd
], [], [], [DUMMY_NUMA])
dnl Add one OpenFlow rule and generate a megaflow.
AT_CHECK([ovs-ofctl add-flow br0 'table=0,in_port=p1,ip,nw_dst=10.1.2.0/24,actions=p2'])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)'])
OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//'], [
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:never, actions:2])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)'])
dnl Replace the OpenFlow rules and trigger revalidation.
AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=ct(commit)' | dnl
ovs-ofctl --bundle replace-flows br0 -])
AT_CHECK([ovs-appctl revalidator/wait])
dnl Prevent flows from expiring.
AT_CHECK([ovs-appctl time/stop])
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)'])
OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//' | strip_xout_keep_actions], [
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.0.2/255.255.0.0,frag=no), packets:0, bytes:0, used:never, actions:ct(commit)
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:0.0s, actions:ct(commit)])
dnl Send more 10.1.0.2 packets so that the 10.1.0.0/16 subtable is sorted
dnl ahead of the 10.1.2.0/24 subtable in the pvector of subtables.
for i in $(seq 0 256); do
AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)'])
done
dnl Warp time enough to trigger subtable optimization.
AT_CHECK([ovs-appctl time/warp 500 2000], [0], [ignore])
AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=p2' | dnl
ovs-ofctl --bundle replace-flows br0 -])
AT_CHECK([ovs-appctl revalidator/wait])
AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//' | strip_xout_keep_actions], [0], [
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.0.2/255.255.0.0,frag=no), packets:0, bytes:0, used:0.0s, actions:2
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:0.0s, actions:2
])
OVS_VSWITCHD_STOP
AT_CLEANUP