2021-07-09 15:58:15 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
|
|
|
|
* Copyright (c) 2019, 2020, 2021 Intel Corporation.
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at:
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef DPIF_NETDEV_PRIVATE_FLOW_H
|
|
|
|
#define DPIF_NETDEV_PRIVATE_FLOW_H 1
|
|
|
|
|
|
|
|
#include "dpif.h"
|
|
|
|
#include "dpif-netdev-private-dpcls.h"
|
|
|
|
|
|
|
|
#include <stdbool.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
#include "cmap.h"
|
|
|
|
#include "openvswitch/thread.h"
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Contained by struct dp_netdev_flow's 'stats' member. */
|
|
|
|
struct dp_netdev_flow_stats {
|
|
|
|
atomic_llong used; /* Last used time, in monotonic msecs. */
|
|
|
|
atomic_ullong packet_count; /* Number of packets matched. */
|
|
|
|
atomic_ullong byte_count; /* Number of bytes matched. */
|
|
|
|
atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Contained by struct dp_netdev_flow's 'last_attrs' member. */
|
|
|
|
struct dp_netdev_flow_attrs {
|
|
|
|
atomic_bool offloaded; /* True if flow is offloaded to HW. */
|
|
|
|
ATOMIC(const char *) dp_layer; /* DP layer the flow is handled in. */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Thread-safety
|
|
|
|
* =============
|
|
|
|
*
|
|
|
|
* Except near the beginning or ending of its lifespan, rule 'rule' belongs to
|
|
|
|
* its pmd thread's classifier. The text below calls this classifier 'cls'.
|
|
|
|
*
|
|
|
|
* Motivation
|
|
|
|
* ----------
|
|
|
|
*
|
|
|
|
* The thread safety rules described here for "struct dp_netdev_flow" are
|
|
|
|
* motivated by two goals:
|
|
|
|
*
|
|
|
|
* - Prevent threads that read members of "struct dp_netdev_flow" from
|
|
|
|
* reading bad data due to changes by some thread concurrently modifying
|
|
|
|
* those members.
|
|
|
|
*
|
|
|
|
* - Prevent two threads making changes to members of a given "struct
|
|
|
|
* dp_netdev_flow" from interfering with each other.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Rules
|
|
|
|
* -----
|
|
|
|
*
|
|
|
|
* A flow 'flow' may be accessed without a risk of being freed during an RCU
|
|
|
|
* grace period. Code that needs to hold onto a flow for a while
|
|
|
|
* should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
|
|
|
|
*
|
|
|
|
* 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
|
|
|
|
* flow from being deleted from 'cls' and it doesn't protect members of 'flow'
|
|
|
|
* from modification.
|
|
|
|
*
|
|
|
|
* Some members, marked 'const', are immutable. Accessing other members
|
|
|
|
* requires synchronization, as noted in more detail below.
|
|
|
|
*/
|
|
|
|
struct dp_netdev_flow {
|
|
|
|
const struct flow flow; /* Unmasked flow that created this entry. */
|
|
|
|
/* Hash table index by unmasked flow. */
|
|
|
|
const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
|
|
|
|
/* 'flow_table'. */
|
dpif-netdev: Forwarding optimization for flows with a simple match.
There are cases where users might want simple forwarding or drop rules
for all packets received from a specific port, e.g ::
"in_port=1,actions=2"
"in_port=2,actions=IN_PORT"
"in_port=3,vlan_tci=0x1234/0x1fff,actions=drop"
"in_port=4,actions=push_vlan:0x8100,set_field:4196->vlan_vid,output:3"
There are also cases where complex OpenFlow rules can be simplified
down to datapath flows with very simple match criteria.
In theory, for very simple forwarding, OVS doesn't need to parse
packets at all in order to follow these rules. "Simple match" lookup
optimization is intended to speed up packet forwarding in these cases.
Design:
Due to various implementation constraints userspace datapath has
following flow fields always in exact match (i.e. it's required to
match at least these fields of a packet even if the OF rule doesn't
need that):
- recirc_id
- in_port
- packet_type
- dl_type
- vlan_tci (CFI + VID) - in most cases
- nw_frag - for ip packets
Not all of these fields are related to packet itself. We already
know the current 'recirc_id' and the 'in_port' before starting the
packet processing. It also seems safe to assume that we're working
with Ethernet packets. So, for the simple OF rule we need to match
only on 'dl_type', 'vlan_tci' and 'nw_frag'.
'in_port', 'dl_type', 'nw_frag' and 13 bits of 'vlan_tci' can be
combined in a single 64bit integer (mark) that can be used as a
hash in hash map. We are using only VID and CFI form the 'vlan_tci',
flows that need to match on PCP will not qualify for the optimization.
Workaround for matching on non-existence of vlan updated to match on
CFI and VID only in order to qualify for the optimization. CFI is
always set by OVS if vlan is present in a packet, so there is no need
to match on PCP in this case. 'nw_frag' takes 2 bits of PCP inside
the simple match mark.
New per-PMD flow table 'simple_match_table' introduced to store
simple match flows only. 'dp_netdev_flow_add' adds flow to the
usual 'flow_table' and to the 'simple_match_table' if the flow
meets following constraints:
- 'recirc_id' in flow match is 0.
- 'packet_type' in flow match is Ethernet.
- Flow wildcards contains only minimal set of non-wildcarded fields
(listed above).
If the number of flows for current 'in_port' in a regular 'flow_table'
equals number of flows for current 'in_port' in a 'simple_match_table',
we may use simple match optimization, because all the flows we have
are simple match flows. This means that we only need to parse
'dl_type', 'vlan_tci' and 'nw_frag' to perform packet matching.
Now we make the unique flow mark from the 'in_port', 'dl_type',
'nw_frag' and 'vlan_tci' and looking for it in the 'simple_match_table'.
On successful lookup we don't need to run full 'miniflow_extract()'.
Unsuccessful lookup technically means that we have no suitable flow
in the datapath and upcall will be required. So, in this case EMC and
SMC lookups are disabled. We may optimize this path in the future by
bypassing the dpcls lookup too.
Performance improvement of this solution on a 'simple match' flows
should be comparable with partial HW offloading, because it parses same
packet fields and uses similar flow lookup scheme.
However, unlike partial HW offloading, it works for all port types
including virtual ones.
Performance results when compared to EMC:
Test setup:
virtio-user OVS virtio-user
Testpmd1 ------------> pmd1 ------------> Testpmd2
(txonly) x<------ pmd2 <------------ (mac swap)
Single stream of 64byte packets. Actions:
in_port=vhost0,actions=vhost1
in_port=vhost1,actions=vhost0
Stats collected from pmd1 and pmd2, so there are 2 scenarios:
Virt-to-Virt : Testpmd1 ------> pmd1 ------> Testpmd2.
Virt-to-NoCopy : Testpmd2 ------> pmd2 --->x Testpmd1.
Here the packet sent from pmd2 to Testpmd1 is always dropped, because
the virtqueue is full since Testpmd1 is in txonly mode and doesn't
receive any packets. This should be closer to the performance of a
VM-to-Phy scenario.
Test performed on machine with Intel Xeon CPU E5-2690 v4 @ 2.60GHz.
Table below represents improvement in throughput when compared to EMC.
+----------------+------------------------+------------------------+
| | Default (-g -O2) | "-Ofast -march=native" |
| Scenario +------------+-----------+------------+-----------+
| | GCC | Clang | GCC | Clang |
+----------------+------------+-----------+------------+-----------+
| Virt-to-Virt | +18.9% | +25.5% | +10.8% | +16.7% |
| Virt-to-NoCopy | +24.3% | +33.7% | +14.9% | +22.0% |
+----------------+------------+-----------+------------+-----------+
For Phy-to-Phy case performance improvement should be even higher, but
it's not the main use-case for this functionality. Performance
difference for the non-simple flows is within a margin of error.
Acked-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-09 14:57:52 +02:00
|
|
|
const struct cmap_node simple_match_node; /* In dp_netdev_pmd_thread's
|
|
|
|
'simple_match_table'. */
|
2021-07-09 15:58:15 +00:00
|
|
|
const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
|
|
|
|
const ovs_u128 ufid; /* Unique flow identifier. */
|
|
|
|
const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
|
|
|
|
const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
|
|
|
|
/* flow. */
|
|
|
|
|
|
|
|
/* Number of references.
|
|
|
|
* The classifier owns one reference.
|
|
|
|
* Any thread trying to keep a rule from being freed should hold its own
|
|
|
|
* reference. */
|
|
|
|
struct ovs_refcount ref_cnt;
|
|
|
|
|
|
|
|
bool dead;
|
dpif-netdev: Forwarding optimization for flows with a simple match.
There are cases where users might want simple forwarding or drop rules
for all packets received from a specific port, e.g ::
"in_port=1,actions=2"
"in_port=2,actions=IN_PORT"
"in_port=3,vlan_tci=0x1234/0x1fff,actions=drop"
"in_port=4,actions=push_vlan:0x8100,set_field:4196->vlan_vid,output:3"
There are also cases where complex OpenFlow rules can be simplified
down to datapath flows with very simple match criteria.
In theory, for very simple forwarding, OVS doesn't need to parse
packets at all in order to follow these rules. "Simple match" lookup
optimization is intended to speed up packet forwarding in these cases.
Design:
Due to various implementation constraints userspace datapath has
following flow fields always in exact match (i.e. it's required to
match at least these fields of a packet even if the OF rule doesn't
need that):
- recirc_id
- in_port
- packet_type
- dl_type
- vlan_tci (CFI + VID) - in most cases
- nw_frag - for ip packets
Not all of these fields are related to packet itself. We already
know the current 'recirc_id' and the 'in_port' before starting the
packet processing. It also seems safe to assume that we're working
with Ethernet packets. So, for the simple OF rule we need to match
only on 'dl_type', 'vlan_tci' and 'nw_frag'.
'in_port', 'dl_type', 'nw_frag' and 13 bits of 'vlan_tci' can be
combined in a single 64bit integer (mark) that can be used as a
hash in hash map. We are using only VID and CFI form the 'vlan_tci',
flows that need to match on PCP will not qualify for the optimization.
Workaround for matching on non-existence of vlan updated to match on
CFI and VID only in order to qualify for the optimization. CFI is
always set by OVS if vlan is present in a packet, so there is no need
to match on PCP in this case. 'nw_frag' takes 2 bits of PCP inside
the simple match mark.
New per-PMD flow table 'simple_match_table' introduced to store
simple match flows only. 'dp_netdev_flow_add' adds flow to the
usual 'flow_table' and to the 'simple_match_table' if the flow
meets following constraints:
- 'recirc_id' in flow match is 0.
- 'packet_type' in flow match is Ethernet.
- Flow wildcards contains only minimal set of non-wildcarded fields
(listed above).
If the number of flows for current 'in_port' in a regular 'flow_table'
equals number of flows for current 'in_port' in a 'simple_match_table',
we may use simple match optimization, because all the flows we have
are simple match flows. This means that we only need to parse
'dl_type', 'vlan_tci' and 'nw_frag' to perform packet matching.
Now we make the unique flow mark from the 'in_port', 'dl_type',
'nw_frag' and 'vlan_tci' and looking for it in the 'simple_match_table'.
On successful lookup we don't need to run full 'miniflow_extract()'.
Unsuccessful lookup technically means that we have no suitable flow
in the datapath and upcall will be required. So, in this case EMC and
SMC lookups are disabled. We may optimize this path in the future by
bypassing the dpcls lookup too.
Performance improvement of this solution on a 'simple match' flows
should be comparable with partial HW offloading, because it parses same
packet fields and uses similar flow lookup scheme.
However, unlike partial HW offloading, it works for all port types
including virtual ones.
Performance results when compared to EMC:
Test setup:
virtio-user OVS virtio-user
Testpmd1 ------------> pmd1 ------------> Testpmd2
(txonly) x<------ pmd2 <------------ (mac swap)
Single stream of 64byte packets. Actions:
in_port=vhost0,actions=vhost1
in_port=vhost1,actions=vhost0
Stats collected from pmd1 and pmd2, so there are 2 scenarios:
Virt-to-Virt : Testpmd1 ------> pmd1 ------> Testpmd2.
Virt-to-NoCopy : Testpmd2 ------> pmd2 --->x Testpmd1.
Here the packet sent from pmd2 to Testpmd1 is always dropped, because
the virtqueue is full since Testpmd1 is in txonly mode and doesn't
receive any packets. This should be closer to the performance of a
VM-to-Phy scenario.
Test performed on machine with Intel Xeon CPU E5-2690 v4 @ 2.60GHz.
Table below represents improvement in throughput when compared to EMC.
+----------------+------------------------+------------------------+
| | Default (-g -O2) | "-Ofast -march=native" |
| Scenario +------------+-----------+------------+-----------+
| | GCC | Clang | GCC | Clang |
+----------------+------------+-----------+------------+-----------+
| Virt-to-Virt | +18.9% | +25.5% | +10.8% | +16.7% |
| Virt-to-NoCopy | +24.3% | +33.7% | +14.9% | +22.0% |
+----------------+------------+-----------+------------+-----------+
For Phy-to-Phy case performance improvement should be even higher, but
it's not the main use-case for this functionality. Performance
difference for the non-simple flows is within a margin of error.
Acked-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-09 14:57:52 +02:00
|
|
|
uint32_t mark; /* Unique flow mark for netdev offloading. */
|
|
|
|
uint64_t simple_match_mark; /* Unique flow mark for the simple match. */
|
2022-02-07 19:09:33 +02:00
|
|
|
odp_port_t orig_in_port;
|
2021-07-09 15:58:15 +00:00
|
|
|
|
|
|
|
/* Statistics. */
|
|
|
|
struct dp_netdev_flow_stats stats;
|
|
|
|
|
|
|
|
/* Statistics and attributes received from the netdev offload provider. */
|
|
|
|
atomic_int netdev_flow_get_result;
|
|
|
|
struct dp_netdev_flow_stats last_stats;
|
|
|
|
struct dp_netdev_flow_attrs last_attrs;
|
|
|
|
|
|
|
|
/* Actions. */
|
|
|
|
OVSRCU_TYPE(struct dp_netdev_actions *) actions;
|
|
|
|
|
|
|
|
/* While processing a group of input packets, the datapath uses the next
|
|
|
|
* member to store a pointer to the output batch for the flow. It is
|
|
|
|
* reset after the batch has been sent out (See dp_netdev_queue_batches(),
|
|
|
|
* packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
|
|
|
|
struct packet_batch_per_flow *batch;
|
|
|
|
|
|
|
|
/* Packet classification. */
|
|
|
|
char *dp_extra_info; /* String to return in a flow dump/get. */
|
|
|
|
struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
|
|
|
|
/* 'cr' must be the last member. */
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline uint32_t
|
|
|
|
dp_netdev_flow_hash(const ovs_u128 *ufid)
|
|
|
|
{
|
|
|
|
return ufid->u32[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Given the number of bits set in miniflow's maps, returns the size of the
|
|
|
|
* 'netdev_flow_key.mf' */
|
|
|
|
static inline size_t
|
|
|
|
netdev_flow_key_size(size_t flow_u64s)
|
|
|
|
{
|
|
|
|
return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Forward declaration required for EMC to unref flows. */
|
|
|
|
/* Only declared in this header; the definition is not part of this file.
 * NOTE(review): presumably releases one reference held on 'flow->ref_cnt' --
 * confirm against the definition. */
void dp_netdev_flow_unref(struct dp_netdev_flow *);
|
|
|
|
|
|
|
|
/* A set of datapath actions within a "struct dp_netdev_flow".
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Thread-safety
|
|
|
|
* =============
|
|
|
|
*
|
|
|
|
* A struct dp_netdev_actions 'actions' is protected with RCU. */
|
|
|
|
struct dp_netdev_actions {
|
|
|
|
/* These members are immutable: they do not change during the struct's
|
|
|
|
* lifetime. */
|
|
|
|
unsigned int size; /* Size of 'actions', in bytes. */
|
|
|
|
struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* dpif-netdev-private-flow.h */
|