2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-23 02:17:42 +00:00
ovs/lib/netdev-dpdk.c

6800 lines
217 KiB
C
Raw Normal View History

/*
* Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "netdev-dpdk.h"
#include <errno.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/virtio_net.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <linux/if.h>
#include <rte_bus.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_dev.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_meter.h>
#include <rte_pci.h>
#include <rte_version.h>
#include <rte_vhost.h>
#include "cmap.h"
#include "coverage.h"
#include "dirs.h"
#include "dp-packet.h"
#include "dpdk.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "if-notifier.h"
#include "mpsc-queue.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as the order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has its MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
#include "openvswitch/ofp-parse.h"
#include "openvswitch/ofp-print.h"
netdev-dpdk: Trigger port reconfiguration in main thread for resets. When OVS (main thread) configures a DPDK netdev, it holds a netdev_dpdk mutex lock. As part of this configure operation, the net/iavf driver (used with i40e VF devices) triggers a queue count change. The PF entity (serviced by a kernel PF driver for example) handles this change and requests back that the VF driver resets the VF device. The driver then completes the VF reset operation on its side and waits for completion of the iavf-event thread responsible for handling various VF device events. On the other hand, handling of the VF reset request in this iavf-event thread results in notifying the application with a port reset request (RTE_ETH_EVENT_INTR_RESET). The OVS reset callback tries to take a hold of the same netdev_dpdk mutex and blocks the iavf-event thread. As a result, the net/iavf driver (still running on OVS main thread) is unable to complete as it is waiting for iavf-event to complete. To break from this situation, the OVS reset callback now won't take a netdev_dpdk mutex. Instead, the port reset request is stored in a simple RTE_ETH_MAXPORTS array associated to a seq object. This is enough to let the VF driver complete this port initialization. The OVS main thread later handles the port reset request. More details in the DPDK upstream bz as this issue appeared following a change in DPDK. Link: https://bugs.dpdk.org/show_bug.cgi?id=1337 Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-01-18 17:15:00 +01:00
#include "openvswitch/poll-loop.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "ovs-thread.h"
#include "packets.h"
#include "smap.h"
#include "sset.h"
#include "timeval.h"
#include "unaligned.h"
#include "unixctl.h"
#include "userspace-tso.h"
#include "util.h"
#include "uuid.h"
/* Virtio queue indices.  VIRTIO_QNUM comes last, so its value equals the
 * number of queue indices defined before it. */
enum {
    VIRTIO_RXQ = 0,
    VIRTIO_TXQ = 1,
    VIRTIO_QNUM = 2
};
/* Declares the logging module for this file under the name 'netdev_dpdk'. */
VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
/* File-wide log rate limiter, initialised with VLOG_RATE_LIMIT_INIT(5, 20).
 * NOTE(review): the meaning of the two arguments is defined by the vlog
 * header, not here -- confirm before changing them. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Defines the 'vhost_tx_contention' coverage counter.
 * NOTE(review): presumably counts contention on the vhost tx path; the
 * increment site is not visible in this part of the file. */
COVERAGE_DEFINE(vhost_tx_contention);
static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */
static bool vhost_postcopy_enabled = false; /* Status of vHost POSTCOPY
* support. */
static bool per_port_memory = false; /* Status of per port memory support */
/* Interval used by the DPDK port watchdog.
 * NOTE(review): the unit is not visible here (presumably seconds) --
 * confirm at the use site. */
#define DPDK_PORT_WATCHDOG_INTERVAL 5
/* Local alias for CACHE_LINE_SIZE. */
#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
/* String identifier "ovs_dpdk".
 * NOTE(review): its consumer is not visible in this part of the file. */
#define OVS_VPORT_DPDK "ovs_dpdk"
/*
 * We need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and a drop in
 * performance for standard Ethernet MTU.
 */
/* Largest L2 overhead accounted for: Ethernet header, CRC and two VLAN
 * headers. */
#define ETHER_HDR_MAX_LEN (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN \
+ (2 * VLAN_HEADER_LEN))
/* Frame length for 'mtu': the MTU plus Ethernet header and CRC (no VLAN
 * headers). */
#define MTU_TO_FRAME_LEN(mtu) ((mtu) + RTE_ETHER_HDR_LEN + \
RTE_ETHER_CRC_LEN)
/* Largest frame length for 'mtu': the MTU plus ETHER_HDR_MAX_LEN. */
#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
/* Inverse of MTU_TO_FRAME_LEN(): strips Ethernet header and CRC from
 * 'frame_len'. */
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
- RTE_ETHER_HDR_LEN - RTE_ETHER_CRC_LEN)
/* NOTE(review): the comment at the top of this group mentions 4KB alignment
 * while this constant is 1024 -- confirm which one is intended. */
#define NETDEV_DPDK_MBUF_ALIGN 1024
/* NOTE(review): presumably the largest packet length, in bytes, accepted for
 * a DPDK port -- confirm at the use site. */
#define NETDEV_DPDK_MAX_PKT_LEN 9728
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Maximum and minimum number of packets in the mempool.  OVS tries to
 * allocate a mempool with MAX_NB_MBUF mbufs; if this fails (because the
 * system doesn't have enough hugepages) it keeps halving the number until
 * the allocation succeeds or MIN_NB_MBUF is reached. */
#define MAX_NB_MBUF (4096 * 64)
#define MIN_NB_MBUF (4096 * 4)
/* Alias for RTE_MEMPOOL_CACHE_MAX_SIZE. */
#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Build-time check: MAX_NB_MBUF can be halved repeatedly, without a
 * remainder, until MIN_NB_MBUF is reached. */
BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
== 0);
/* Build-time check: the smallest NB_MBUF that we're going to try must be a
 * multiple of MP_CACHE_SZ.  This is advised by the DPDK documentation. */
BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
% MP_CACHE_SZ == 0);
/* NOTE(review): presumably the id of NUMA socket 0 -- confirm at the use
 * site. */
#define SOCKET0 0
/* Default size of Physical NIC RXQ */
#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
/* Default size of Physical NIC TXQ */
#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
/* Maximum size of Physical NIC Queues */
#define NIC_PORT_MAX_Q_SIZE 4096
#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
* yet mapped to another queue. */
/* Port id value used to mean "no valid port"; defined as RTE_MAX_ETHPORTS. */
#define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
/* printf-style conversion matching dpdk_port_t. */
#define DPDK_PORT_ID_FMT "%"PRIu16
/* Minimum amount of vhost tx retries, effectively a disable. */
#define VHOST_ENQ_RETRY_MIN 0
/* Maximum amount of vhost tx retries. */
#define VHOST_ENQ_RETRY_MAX 32
/* Legacy default value for vhost tx retries. */
#define VHOST_ENQ_RETRY_DEF 8
/* Buffer size for names: the larger of PATH_MAX and IFNAMSIZ. */
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
/* List of required flags advertised by the hardware that will be used
 * if TSO is enabled: TCP segmentation plus TCP, UDP and IPv4 checksum
 * offload.  Ideally this should include RTE_ETH_TX_OFFLOAD_SCTP_CKSUM.
 * However, very few drivers support that at the moment and SCTP is not a
 * widely used protocol like TCP and UDP, so it's optional. */
#define DPDK_TX_TSO_OFFLOAD_FLAGS (RTE_ETH_TX_OFFLOAD_TCP_TSO \
| RTE_ETH_TX_OFFLOAD_TCP_CKSUM \
| RTE_ETH_TX_OFFLOAD_UDP_CKSUM \
| RTE_ETH_TX_OFFLOAD_IPV4_CKSUM)
static const struct rte_eth_conf port_conf = {
.rxmode = {
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
.offloads = 0,
},
.rx_adv_conf = {
.rss_conf = {
.rss_key = NULL,
.rss_hf = RTE_ETH_RSS_IP | RTE_ETH_RSS_UDP | RTE_ETH_RSS_TCP,
},
},
.txmode = {
.mq_mode = RTE_ETH_MQ_TX_NONE,
},
};
/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 *
 * They are only declared here so that the 'virtio_net_device_ops' table can
 * reference them.  Each one takes the vhost device id 'vid';
 * vring_state_changed() additionally takes the affected 'queue_id' and an
 * 'enable' flag.
 */
static int new_device(int vid);
static void destroy_device(int vid);
static int vring_state_changed(int vid, uint16_t queue_id, int enable);
static void destroy_connection(int vid);
/* Callback table of type 'struct rte_vhost_device_ops'.  It wires this
 * file's new_device(), destroy_device(), vring_state_changed() and
 * destroy_connection() handlers into their slots; the 'features_changed'
 * and 'new_connection' hooks are explicitly left unset. */
static const struct rte_vhost_device_ops virtio_net_device_ops = {
    .new_device          = new_device,
    .destroy_device      = destroy_device,
    .vring_state_changed = vring_state_changed,
    .features_changed    = NULL,
    .new_connection      = NULL,
    .destroy_connection  = destroy_connection,
};
/* Custom software stats for dpdk ports.  Every member is a 64-bit counter. */
struct netdev_dpdk_sw_stats {
    uint64_t tx_retries;            /* No. of retries when unable to
                                     * transmit. */
    uint64_t tx_failure_drops;      /* Drops when unable to transmit;
                                     * probably the Tx queue is full. */
    uint64_t tx_mtu_exceeded_drops; /* Drops because the packet length is
                                     * greater than the device MTU. */
    uint64_t tx_qos_drops;          /* Drops in egress policer processing. */
    uint64_t rx_qos_drops;          /* Drops in ingress policer processing. */
    uint64_t tx_invalid_hwol_drops; /* Drops in HWOL processing. */
};
/* Kind of DPDK device: an Ethernet device or a vhost device. */
enum dpdk_dev_type {
    DPDK_DEV_ETH   = 0,
    DPDK_DEV_VHOST = 1,
};
/* Quality of Service */

/* One QoS configuration instance.  It is always associated with a
 * particular network device.
 *
 * Every QoS implementation subclasses this structure, adding whatever
 * extra data it needs.
 */
struct qos_conf {
    const struct dpdk_qos_ops *ops;     /* Operations of the implementation. */
    rte_spinlock_t lock;                /* Spinlock kept with the instance. */
};
/* QoS queue information used by the netdev queue dump functions. */
struct netdev_dpdk_queue_state {
    uint32_t *queues;   /* Array of queue ids. */
    size_t cur_queue;   /* Current position within 'queues'. */
    size_t n_queues;    /* Number of entries in 'queues'. */
};
/* A particular implementation of dpdk QoS operations.
 *
 * Unless noted otherwise, every function below returns 0 on success or a
 * positive errno value on failure, and every member must be provided.
 */
struct dpdk_qos_ops {
    /* Name of the QoS type. */
    const char *qos_name;

    /* Constructs a qos_conf object.  The implementation should make the
     * appropriate calls to configure QoS according to 'details'.
     *
     * The contents of 'details' should be documented as valid for
     * 'qos_name' in the "other_config" column in the "QoS" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * Must return 0 if and only if it sets '*conf' to an initialized
     * 'struct qos_conf'.
     *
     * Must be non-null for every QoS implementation.
     */
    int (*qos_construct)(const struct smap *details, struct qos_conf **conf);

    /* Destroys the data structures that the implementation allocated as
     * part of 'conf'.
     *
     * Must be non-null for every QoS implementation.
     */
    void (*qos_destruct)(struct qos_conf *conf);

    /* Stores the details of the 'conf' configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for
     * 'qos_name' in the "other_config" column in the "QoS" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_get)(const struct qos_conf *conf, struct smap *details);

    /* Returns true if 'conf' is already configured according to 'details'.
     *
     * The contents of 'details' should be documented as valid for
     * 'qos_name' in the "other_config" column in the "QoS" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * Must be non-null for every QoS implementation.
     */
    bool (*qos_is_equal)(const struct qos_conf *conf,
                         const struct smap *details);

    /* Modifies an array of rte_mbufs.  The modification is specific to each
     * QoS implementation.
     *
     * Takes the array of mbufs 'pkts' and 'pkt_cnt', the number of mbufs
     * currently present in the array.
     *
     * After performing its QoS modification on the array, the function
     * returns the number of mbufs now present in it.  This value can then
     * be passed to the port send function along with the modified array
     * for transmission.
     *
     * NOTE(review): 'should_steal' is not described by the original
     * documentation; presumably it tells the implementation whether it may
     * free the mbufs it drops -- confirm against the implementations.
     *
     * Must be non-null for every QoS implementation.
     */
    int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
                   int pkt_cnt, bool should_steal);

    /* Constructs a QoS queue.  The implementation should make the
     * appropriate calls to configure the QoS queue according to 'details'.
     *
     * The contents of 'details' should be documented as valid for
     * 'qos_name' in the "other_config" column in the "QoS" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * Must return 0 if and only if it constructs the QoS queue
     * successfully.
     */
    int (*qos_queue_construct)(const struct smap *details,
                               uint32_t queue_id, struct qos_conf *conf);

    /* Destroys the QoS queue 'queue_id'. */
    void (*qos_queue_destruct)(struct qos_conf *conf, uint32_t queue_id);

    /* Stores the details of the QoS queue configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for
     * 'qos_name' in the "other_config" column in the "QoS" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_queue_get)(struct smap *details, uint32_t queue_id,
                         const struct qos_conf *conf);

    /* Stores the statistics of the QoS queue 'queue_id' into 'stats'. */
    int (*qos_queue_get_stats)(const struct qos_conf *conf, uint32_t queue_id,
                               struct netdev_queue_stats *stats);

    /* Sets up the 'netdev_dpdk_queue_state' structure used by the dpdk
     * queue dump functions.
     */
    int (*qos_queue_dump_state_init)(const struct qos_conf *conf,
                                     struct netdev_dpdk_queue_state *state);
};
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rates based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 1000pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps are shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
/* dpdk_qos_ops for each type of user space QoS implementation. */
static const struct dpdk_qos_ops egress_policer_ops;
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
static const struct dpdk_qos_ops trtcm_policer_ops;
/*
* Array of dpdk_qos_ops, contains pointer to all supported QoS
* operations.
*/
static const struct dpdk_qos_ops *const qos_confs[] = {
&egress_policer_ops,
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
&trtcm_policer_ops,
NULL
};
/* Guards 'dpdk_list'. */
static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
/* Contains all 'struct dpdk_dev's.
 * NOTE(review): no 'struct dpdk_dev' is visible in this part of the file;
 * the type name in this comment may be stale -- confirm. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
= OVS_LIST_INITIALIZER(&dpdk_list);
/* Guards 'dpdk_mp_list'.  Annotated with OVS_ACQ_AFTER(dpdk_mutex): when both
 * mutexes are held, 'dpdk_mutex' is the one taken first. */
static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
= OVS_MUTEX_INITIALIZER;
/* Contains all 'struct dpdk_mp's. */
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
= OVS_LIST_INITIALIZER(&dpdk_mp_list);
struct dpdk_mp {
struct rte_mempool *mp;
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
int mtu;
int socket_id;
int refcount;
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has its MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
};
/* One entry of user-provided mempool configuration; 'user_mempools' points
 * to elements of this type.
 *
 * NOTE(review): presumably each entry corresponds to one "MTU[:NUMA]" item of
 * the shared mempool configuration option -- confirm against the parser. */
struct user_mempool_config {
/* NOTE(review): presumably the user-requested MTU after adjustment to the
 * size actually used for the mempool's mbufs -- confirm. */
int adj_mtu;
/* NOTE(review): presumably the NUMA socket this entry applies to -- confirm
 * how "any socket" is encoded, if at all. */
int socket_id;
};
/* File-scope table of user mempool configuration entries.  The pointer starts
 * out NULL, i.e. no entries are configured by default.
 *
 * NOTE(review): 'n_user_mempools' is presumably the number of valid elements
 * in 'user_mempools'; what synchronizes access to them is not visible here --
 * confirm. */
static struct user_mempool_config *user_mempools = NULL;
static int n_user_mempools;
/* Per-queue transmit state.  There should be one 'struct dpdk_tx_queue'
 * created for each netdev tx queue. */
struct dpdk_tx_queue {
/* Padding to make dpdk_tx_queue exactly one cache line long.
 * NOTE(review): presumably this keeps adjacent queues in an array from
 * sharing a cache line -- confirm. */
PADDED_MEMBERS(CACHE_LINE_SIZE,
/* Protects the members and the NIC queue from concurrent access.
 * It is used only if the queue is shared among different pmd threads
 * (see 'concurrent_txq'). */
rte_spinlock_t tx_lock;
/* Maps this configured vhost-user queue to a queue that has been
 * enabled by the guest. */
int map;
);
};
struct ingress_policer {
struct rte_meter_srtcm_params app_srtcm_params;
struct rte_meter_srtcm in_policer;
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
struct rte_meter_srtcm_profile in_prof;
rte_spinlock_t policer_lock;
};
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports. Add Rx checksum offloading feature support on DPDK physical ports. By default, the Rx checksum offloading is enabled if NIC supports. However, the checksum offloading can be turned OFF either while adding a new DPDK physical port to OVS or at runtime. The rx checksum offloading can be turned off by setting the parameter to 'false'. For eg: To disable the rx checksum offloading when adding a port, 'ovs-vsctl add-port br0 dpdk0 -- \ set Interface dpdk0 type=dpdk options:rx-checksum-offload=false' OR (to disable at run time after port is being added to OVS) 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false' Similarly to turn ON rx checksum offloading at run time, 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true' The Tx checksum offloading support is not implemented due to the following reasons. 1) Checksum offloading and vectorization are mutually exclusive in DPDK poll mode driver. Vector packet processing is turned OFF when checksum offloading is enabled which causes significant performance drop at Tx side. 2) Normally, OVS generates checksum for tunnel packets in software at the 'tunnel push' operation, where the tunnel headers are created. However enabling Tx checksum offloading involves, *) Mark every packets for tx checksum offloading at 'tunnel_push' and recirculate. *) At the time of xmit, validate the same flag and instruct the NIC to do the checksum calculation. In case NIC doesnt support Tx checksum offloading, the checksum calculation has to be done in software before sending out the packets. No significant performance improvement noticed with Tx checksum offloading due to the e overhead of additional validations + non vector packet processing. In some test scenarios, it introduces performance drop too. 
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling decapsulation even though the SSE vector Rx function is disabled in DPDK poll mode driver. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Acked-by: Jesse Gross <jesse@kernel.org> Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
enum dpdk_hw_ol_features {
NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
NETDEV_RX_HW_CRC_STRIP = 1 << 1,
NETDEV_RX_HW_SCATTER = 1 << 2,
NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3,
NETDEV_TX_TCP_CKSUM_OFFLOAD = 1 << 4,
NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
NETDEV_TX_TSO_OFFLOAD = 1 << 7,
NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports. Add Rx checksum offloading feature support on DPDK physical ports. By default, the Rx checksum offloading is enabled if NIC supports. However, the checksum offloading can be turned OFF either while adding a new DPDK physical port to OVS or at runtime. The rx checksum offloading can be turned off by setting the parameter to 'false'. For eg: To disable the rx checksum offloading when adding a port, 'ovs-vsctl add-port br0 dpdk0 -- \ set Interface dpdk0 type=dpdk options:rx-checksum-offload=false' OR (to disable at run time after port is being added to OVS) 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false' Similarly to turn ON rx checksum offloading at run time, 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true' The Tx checksum offloading support is not implemented due to the following reasons. 1) Checksum offloading and vectorization are mutually exclusive in DPDK poll mode driver. Vector packet processing is turned OFF when checksum offloading is enabled which causes significant performance drop at Tx side. 2) Normally, OVS generates checksum for tunnel packets in software at the 'tunnel push' operation, where the tunnel headers are created. However enabling Tx checksum offloading involves, *) Mark every packets for tx checksum offloading at 'tunnel_push' and recirculate. *) At the time of xmit, validate the same flag and instruct the NIC to do the checksum calculation. In case NIC doesnt support Tx checksum offloading, the checksum calculation has to be done in software before sending out the packets. No significant performance improvement noticed with Tx checksum offloading due to the e overhead of additional validations + non vector packet processing. In some test scenarios, it introduces performance drop too. 
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling decapsulation even though the SSE vector Rx function is disabled in DPDK poll mode driver. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Acked-by: Jesse Gross <jesse@kernel.org> Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
};
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
/* Bit flags for Rx steering.  Each flag occupies its own bit so that values
 * can be combined with bitwise OR; LACP is the only flag defined here.
 *
 * NOTE(review): presumably each flag names a control protocol whose packets
 * are steered to a dedicated Rx queue -- confirm against the users of this
 * enum. */
enum dpdk_rx_steer_flags {
DPDK_RX_STEER_LACP = 1 << 0,
};
/* Flags for the netdev_dpdk virtio_features_state field.
 * This is used for the virtio features recovery mechanism linked to TSO
 * support.
 *
 * The first four macros each select a distinct single bit (bits 0 to 3) and
 * are built with UINT8_C() to go with the uint8_t field they are stored in. */
#define OVS_VIRTIO_F_CLEAN (UINT8_C(1) << 0)
#define OVS_VIRTIO_F_WORKAROUND (UINT8_C(1) << 1)
#define OVS_VIRTIO_F_NEGOTIATED (UINT8_C(1) << 2)
#define OVS_VIRTIO_F_RECONF_PENDING (UINT8_C(1) << 3)
/* Convenience masks: NEGOTIATED combined with CLEAN or with WORKAROUND. */
#define OVS_VIRTIO_F_CLEAN_NEGOTIATED \
(OVS_VIRTIO_F_CLEAN | OVS_VIRTIO_F_NEGOTIATED)
#define OVS_VIRTIO_F_WORKAROUND_NEGOTIATED \
(OVS_VIRTIO_F_WORKAROUND | OVS_VIRTIO_F_NEGOTIATED)
/*
* In order to avoid confusion in variable names, the following naming
* convention should be used, if possible:
*
* 'struct netdev' : 'netdev'
* 'struct netdev_dpdk' : 'dev'
* 'struct netdev_rxq' : 'rxq'
* 'struct netdev_rxq_dpdk' : 'rx'
*
* Example:
* struct netdev *netdev = netdev_from_name(name);
* struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
*
* Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
* already defined.
*/
struct netdev_dpdk {
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
dpdk_port_t port_id;
/* If true, device was attached by rte_eth_dev_attach(). */
bool attached;
/* If true, rte_eth_dev_start() was successfully called */
bool started;
bool reset_needed;
/* 1 pad byte here. */
struct eth_addr hwaddr;
int mtu;
int socket_id;
int buf_size;
int max_packet_len;
enum dpdk_dev_type type;
enum netdev_flags flags;
netdev-dpdk: Fixed netdev_dpdk structure alignment Currently, the code tells us we have 4 pad bytes left in cacheline0 while actually we are 8 bytes short: struct netdev_dpdk { union { OVS_CACHE_LINE_MARKER cacheline0; /* 1 */ struct { dpdk_port_t port_id; /* 0 2 */ _Bool attached; /* 2 1 */ struct eth_addr hwaddr; /* 4 6 */ int mtu; /* 12 4 */ int socket_id; /* 16 4 */ int buf_size; /* 20 4 */ int max_packet_len; /* 24 4 */ enum dpdk_dev_type type; /* 28 4 */ enum netdev_flags flags; /* 32 4 */ char * devargs; /* 40 8 */ struct dpdk_tx_queue * tx_q; /* 48 8 */ struct rte_eth_link link; /* 56 8 */ int link_reset_cnt; /* 64 4 */ }; /* 72 */ uint8_t pad9[128]; /* 128 */ }; /* 0 128 */ /* --- cacheline 2 boundary (128 bytes) --- */ Re-located one member, link_reset_cnt, and now it's one cache line: struct netdev_dpdk { union { OVS_CACHE_LINE_MARKER cacheline0; /* 1 */ struct { dpdk_port_t port_id; /* 0 2 */ _Bool attached; /* 2 1 */ struct eth_addr hwaddr; /* 4 6 */ int mtu; /* 12 4 */ int socket_id; /* 16 4 */ int buf_size; /* 20 4 */ int max_packet_len; /* 24 4 */ enum dpdk_dev_type type; /* 28 4 */ enum netdev_flags flags; /* 32 4 */ int link_reset_cnt; /* 36 4 */ char * devargs; /* 40 8 */ struct dpdk_tx_queue * tx_q; /* 48 8 */ struct rte_eth_link link; /* 56 8 */ }; /* 64 */ uint8_t pad9[64]; /* 64 */ }; /* 0 64 */ /* --- cacheline 1 boundary (64 bytes) --- */ Fixes: 5e925ccc2a6f ("netdev-dpdk: DPDK v17.11 upgrade") Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-04-25 17:48:23 +02:00
int link_reset_cnt;
union {
/* Device arguments for dpdk ports. */
char *devargs;
/* Identifier used to distinguish vhost devices from each other. */
char *vhost_id;
};
struct dpdk_tx_queue *tx_q;
struct rte_eth_link link;
);
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
struct dpdk_mp *dpdk_mp;
/* virtio identifier for vhost devices */
ovsrcu_index vid;
/* True if vHost device is 'up' and has been reconfigured at least once */
bool vhost_reconfigured;
atomic_uint8_t vhost_tx_retries_max;
/* Flags for virtio features recovery mechanism. */
uint8_t virtio_features_state;
/* 1 pad byte here. */
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
struct netdev up;
/* In dpdk_list. */
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
/* QoS configuration and lock for the device */
OVSRCU_TYPE(struct qos_conf *) qos_conf;
/* Ingress Policer */
OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
uint32_t policer_rate;
uint32_t policer_burst;
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
/* Array of vhost rxq states, see vring_state_changed. */
bool *vhost_rxq_enabled;
/* Ensures that Rx metadata delivery is configured only once. */
bool rx_metadata_delivery_configured;
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
struct netdev_stats stats;
struct netdev_dpdk_sw_stats *sw_stats;
/* Protects stats */
rte_spinlock_t stats_lock;
/* 36 pad bytes here. */
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
/* The following properties cannot be changed when a device is running,
* so we remember the request and update them next time
* netdev_dpdk*_reconfigure() is called */
int requested_mtu;
int requested_n_txq;
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
/* User input for n_rxq (see dpdk_set_rxq_config). */
int user_n_rxq;
/* user_n_rxq + an optional rx steering queue (see
* netdev_dpdk_reconfigure). This field is different from the other
* requested_* fields as it may contain a different value than the user
* input. */
int requested_n_rxq;
int requested_rxq_size;
int requested_txq_size;
/* Number of rx/tx descriptors for physical devices */
int rxq_size;
int txq_size;
/* Socket ID detected when vHost device is brought up */
int requested_socket_id;
/* Denotes whether vHost port is client/server mode */
uint64_t vhost_driver_flags;
/* DPDK-ETH Flow control */
struct rte_eth_fc_conf fc_conf;
/* DPDK-ETH hardware offload features,
* from the enum set 'dpdk_hw_ol_features' */
uint32_t hw_ol_features;
/* Properties for link state change detection mode.
* If lsc_interrupt_mode is set to false, poll mode is used,
* otherwise interrupt mode is used. */
bool requested_lsc_interrupt_mode;
bool lsc_interrupt_mode;
/* VF configuration. */
struct eth_addr requested_hwaddr;
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
/* Requested rx queue steering flags,
* from the enum set 'dpdk_rx_steer_flags'. */
uint64_t requested_rx_steer_flags;
uint64_t rx_steer_flags;
size_t rx_steer_flows_num;
struct rte_flow **rx_steer_flows;
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
/* Names of all XSTATS counters */
struct rte_eth_xstat_name *rte_xstats_names;
int rte_xstats_names_size;
int rte_xstats_ids_size;
uint64_t *rte_xstats_ids;
);
};
/* DPDK receive-queue object: the generic 'struct netdev_rxq' paired with a
 * DPDK port identifier. */
struct netdev_rxq_dpdk {
/* Generic rx queue state. */
struct netdev_rxq up;
/* DPDK port identifier; presumably the port this queue belongs to
 * (TODO confirm against the rxq construct code). */
dpdk_port_t port_id;
};
/* Function prototypes.
 *
 * The two destruct routines are compared against 'class->destruct' by
 * is_dpdk_class() below, so they must be declared before it.  The
 * definitions of all of these functions are outside this part of the
 * file. */
static void netdev_dpdk_destruct(struct netdev *netdev);
static void netdev_dpdk_vhost_destruct(struct netdev *netdev);
static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
struct netdev_custom_stats *);
static void netdev_dpdk_configure_xstats(struct netdev_dpdk *dev);
static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
/* The next two prototypes are not 'static', i.e. they have external
 * linkage. */
int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
static void netdev_dpdk_mbuf_dump(const char *prefix, const char *message,
const struct rte_mbuf *);
/* Returns true if 'class' is a DPDK netdev class.  A class is recognised by
 * its 'destruct' callback being one of the two DPDK destruct routines
 * (ethernet or vhost). */
static bool
is_dpdk_class(const struct netdev_class *class)
{
    if (class->destruct == netdev_dpdk_destruct) {
        return true;
    }

    return class->destruct == netdev_dpdk_vhost_destruct;
}
/* Returns the mbuf buffer size to use for a port whose MTU is 'mtu'.
 *
 * DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less.  If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accommodate the packet in its
 * entirety.  In addition, certain drivers need the Rx buffer to have room for
 * two VLAN tags (QinQ frames).  When the RX buffer is too small the driver
 * turns on scatter RX, which reduces performance.
 *
 * To avoid that, the maximum frame length for 'mtu' is rounded up to a
 * multiple of NETDEV_DPDK_MBUF_ALIGN, and RTE_PKTMBUF_HEADROOM is added on
 * top of it. */
static uint32_t
dpdk_buf_size(int mtu)
{
    uint32_t aligned_frame_len = ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu),
                                          NETDEV_DPDK_MBUF_ALIGN);

    return aligned_frame_len + RTE_PKTMBUF_HEADROOM;
}
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has it's MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
/* Selects the adjusted MTU of the user configured shared mempool that best
 * fits a port.
 *
 * 'port_adj_mtu' is the port's adjusted MTU, 'port_mtu' is the port's MTU
 * (only used in log messages) and 'port_socket_id' is the port's NUMA
 * socket.
 *
 * An entry of 'user_mempools' is a candidate when its adjusted MTU is at
 * least 'port_adj_mtu' and its socket id is either INT_MAX or equal to
 * 'port_socket_id'.  Among the candidates the smallest adjusted MTU wins.
 *
 * Returns the winning adjusted MTU, or 'port_adj_mtu' itself when no entry
 * is suitable. */
static int
dpdk_get_user_adjusted_mtu(int port_adj_mtu, int port_mtu, int port_socket_id)
{
    int chosen_adj_mtu = INT_MAX;

    for (unsigned idx = 0; idx < n_user_mempools; idx++) {
        int cand_adj_mtu = user_mempools[idx].adj_mtu;
        int cand_socket_id = user_mempools[idx].socket_id;
        bool big_enough = cand_adj_mtu >= port_adj_mtu;
        bool socket_ok = cand_socket_id == INT_MAX
                         || cand_socket_id == port_socket_id;

        /* Skip entries that cannot serve this port, as well as entries
         * that are no improvement over what was already found. */
        if (!big_enough || !socket_ok || cand_adj_mtu >= chosen_adj_mtu) {
            continue;
        }

        /* Lowest valid user MTU seen so far. */
        chosen_adj_mtu = cand_adj_mtu;
        if (chosen_adj_mtu == port_adj_mtu) {
            /* Exact fit, nothing later in the list can do better. */
            break;
        }
    }

    if (chosen_adj_mtu != INT_MAX) {
        VLOG_DBG("Found user configured shared mempool with mbufs "
                 "of size %d, suitable for port with MTU %d, NUMA %d.",
                 MTU_TO_FRAME_LEN(chosen_adj_mtu), port_mtu,
                 port_socket_id);
        return chosen_adj_mtu;
    }

    VLOG_DBG("No user configured shared mempool mbuf sizes found "
             "suitable for port with MTU %d, NUMA %d.", port_mtu,
             port_socket_id);
    return port_adj_mtu;
}
/* Obtains 'sz' bytes of zero-filled memory from DPDK via rte_zmalloc(),
 * tagged OVS_VPORT_DPDK and aligned to OVS_CACHE_LINE_SIZE.
 *
 * In contrast to xmalloc(), a failed allocation is reported to the caller by
 * returning NULL. */
static void *
dpdk_rte_mzalloc(size_t sz)
{
    void *area = rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);

    return area;
}
/* Releases the DPDK buffer backing 'p'.  The dp_packet pointer is
 * reinterpreted as a 'struct rte_mbuf' pointer and handed to
 * rte_pktmbuf_free(). */
void
free_dpdk_buf(struct dp_packet *p)
{
    rte_pktmbuf_free((struct rte_mbuf *) p);
}
/* Per-object initialisation routine with the (mempool, opaque argument,
 * object, index) signature.  Only the object pointer '_p' is used: it is
 * treated as a dp_packet and set up with dp_packet_init_dpdk(). */
static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
                     void *opaque_arg OVS_UNUSED,
                     void *_p,
                     unsigned i OVS_UNUSED)
{
    struct dp_packet *packet = _p;

    dp_packet_init_dpdk(packet);
}
/* Returns the result of rte_mempool_full() for 'mp', i.e. nonzero when all
 * of the mbufs are back in the mempool.  The caller must hold
 * 'dpdk_mp_mutex'.
 *
 * rte_mempool_full() is not atomic, but it is the best check available.
 * Since no mbufs are requested from the mempool any more at this point,
 * mbufs do not move from the 'mempool ring' to the 'mempool cache'.
 * rte_mempool_full() counts the ring before the caches, so false positives
 * do not occur in this use case, and false negatives are handled.
 *
 * Should a future implementation of rte_mempool_full() change, a false
 * positive could become possible.  Even that would likely be acceptable,
 * because additional checks are made while freeing a mempool, but it would
 * make things racy. */
static int
dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
    int all_mbufs_returned = rte_mempool_full(mp);

    return all_mbufs_returned;
}
/* Free unused mempools. */
static void
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
{
struct dpdk_mp *dmp;
LIST_FOR_EACH_SAFE (dmp, list_node, &dpdk_mp_list) {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
ovs_list_remove(&dmp->list_node);
rte_mempool_free(dmp->mp);
rte_free(dmp);
}
}
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Calculating the required number of mbufs differs depending on the
* mempool model being used. Check if per port memory is in use before
* calculating.
*/
static uint32_t
dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu)
{
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
uint32_t n_mbufs;
if (!per_port_memory) {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Shared memory are being used.
* XXX: this is a really rough method of provisioning memory.
* It's impossible to determine what the exact memory requirements are
* when the number of ports and rxqs that utilize a particular mempool
* can change dynamically at runtime. For now, use this rough
* heurisitic.
*/
if (mtu >= RTE_ETHER_MTU) {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
n_mbufs = MAX_NB_MBUF;
} else {
n_mbufs = MIN_NB_MBUF;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
} else {
/* Per port memory is being used.
* XXX: rough estimation of number of mbufs required for this port:
* <packets required to fill the device rxqs>
* + <packets that could be stuck on other ports txqs>
* + <packets in the pmd threads>
* + <additional memory for corner cases>
*/
n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
+ dev->requested_n_txq * dev->requested_txq_size
+ MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
+ MIN_NB_MBUF;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
return n_mbufs;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
static struct dpdk_mp *
dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
{
char mp_name[RTE_MEMPOOL_NAMESIZE];
const char *netdev_name = netdev_get_name(&dev->up);
int socket_id = dev->requested_socket_id;
netdev-dpdk: fix mbuf sizing There are numerous factors that must be considered when calculating the size of an mbuf: - the data portion of the mbuf must be sized in accordance With Rx buffer alignment (typically 1024B). So, for example, in order to successfully receive and capture a 1500B packet, mbufs with a data portion of size 2048B must be used. - in OvS, the elements that comprise an mbuf are: * the dp packet, which includes a struct rte mbuf (704B) * RTE_PKTMBUF_HEADROOM (128B) * packet data (aligned to 1k, as previously described) * RTE_PKTMBUF_TAILROOM (typically 0) Some PMDs require that the total mbuf size (i.e. the total sum of all of the above-listed components' lengths) is cache-aligned. To satisfy this requirement, it may be necessary to round up the total mbuf size with respect to cacheline size. In doing so, it's possible that the dp_packet's data portion is inadvertently increased in size, such that it no longer adheres to Rx buffer alignment. Consequently, the following property of the mbuf no longer holds true: mbuf.data_len == mbuf.buf_len - mbuf.data_off This creates a problem in the case of multi-segment mbufs, where that assumption is assumed to be true for all but the final segment in an mbuf chain. Resolve this issue by adjusting the size of the mbuf's private data portion, as opposed to the packet data portion when aligning mbuf size to cachelines. Co-authored-by: Tiago Lam <tiago.lam@intel.com> Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization") Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size") CC: Santosh Shukla <santosh.shukla@caviumnetworks.com> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Tiago Lam <tiago.lam@intel.com> Acked-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-11-02 09:06:32 +00:00
uint32_t n_mbufs = 0;
uint32_t mbuf_size = 0;
uint32_t aligned_mbuf_size = 0;
uint32_t mbuf_priv_data_len = 0;
uint32_t pkt_size = 0;
uint32_t hash = hash_string(netdev_name, 0);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
struct dpdk_mp *dmp = NULL;
int ret;
dmp = dpdk_rte_mzalloc(sizeof *dmp);
if (!dmp) {
return NULL;
}
dmp->socket_id = socket_id;
dmp->mtu = mtu;
dmp->refcount = 1;
netdev-dpdk: fix mbuf sizing There are numerous factors that must be considered when calculating the size of an mbuf: - the data portion of the mbuf must be sized in accordance With Rx buffer alignment (typically 1024B). So, for example, in order to successfully receive and capture a 1500B packet, mbufs with a data portion of size 2048B must be used. - in OvS, the elements that comprise an mbuf are: * the dp packet, which includes a struct rte mbuf (704B) * RTE_PKTMBUF_HEADROOM (128B) * packet data (aligned to 1k, as previously described) * RTE_PKTMBUF_TAILROOM (typically 0) Some PMDs require that the total mbuf size (i.e. the total sum of all of the above-listed components' lengths) is cache-aligned. To satisfy this requirement, it may be necessary to round up the total mbuf size with respect to cacheline size. In doing so, it's possible that the dp_packet's data portion is inadvertently increased in size, such that it no longer adheres to Rx buffer alignment. Consequently, the following property of the mbuf no longer holds true: mbuf.data_len == mbuf.buf_len - mbuf.data_off This creates a problem in the case of multi-segment mbufs, where that assumption is assumed to be true for all but the final segment in an mbuf chain. Resolve this issue by adjusting the size of the mbuf's private data portion, as opposed to the packet data portion when aligning mbuf size to cachelines. Co-authored-by: Tiago Lam <tiago.lam@intel.com> Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization") Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size") CC: Santosh Shukla <santosh.shukla@caviumnetworks.com> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Tiago Lam <tiago.lam@intel.com> Acked-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-11-02 09:06:32 +00:00
/* Get the size of each mbuf, based on the MTU */
mbuf_size = MTU_TO_FRAME_LEN(mtu);
n_mbufs = dpdk_calculate_mbufs(dev, mtu);
do {
/* Full DPDK memory pool name must be unique and cannot be
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
* longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
* mempool case this can result in one device using a mempool
* which references a different device in its name. However as
* mempool names are hashed, the device name will not be readable
* so this is not an issue for tasks such as debugging.
*/
ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
netdev-dpdk: fix mbuf sizing There are numerous factors that must be considered when calculating the size of an mbuf: - the data portion of the mbuf must be sized in accordance With Rx buffer alignment (typically 1024B). So, for example, in order to successfully receive and capture a 1500B packet, mbufs with a data portion of size 2048B must be used. - in OvS, the elements that comprise an mbuf are: * the dp packet, which includes a struct rte mbuf (704B) * RTE_PKTMBUF_HEADROOM (128B) * packet data (aligned to 1k, as previously described) * RTE_PKTMBUF_TAILROOM (typically 0) Some PMDs require that the total mbuf size (i.e. the total sum of all of the above-listed components' lengths) is cache-aligned. To satisfy this requirement, it may be necessary to round up the total mbuf size with respect to cacheline size. In doing so, it's possible that the dp_packet's data portion is inadvertently increased in size, such that it no longer adheres to Rx buffer alignment. Consequently, the following property of the mbuf no longer holds true: mbuf.data_len == mbuf.buf_len - mbuf.data_off This creates a problem in the case of multi-segment mbufs, where that assumption is assumed to be true for all but the final segment in an mbuf chain. Resolve this issue by adjusting the size of the mbuf's private data portion, as opposed to the packet data portion when aligning mbuf size to cachelines. Co-authored-by: Tiago Lam <tiago.lam@intel.com> Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization") Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size") CC: Santosh Shukla <santosh.shukla@caviumnetworks.com> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Tiago Lam <tiago.lam@intel.com> Acked-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-11-02 09:06:32 +00:00
"ovs%08x%02d%05d%07u",
hash, socket_id, mtu, n_mbufs);
if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
VLOG_DBG("snprintf returned %d. "
"Failed to generate a mempool name for \"%s\". "
"Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
ret, netdev_name, hash, socket_id, mtu, n_mbufs);
break;
}
netdev-dpdk: fix mbuf sizing There are numerous factors that must be considered when calculating the size of an mbuf: - the data portion of the mbuf must be sized in accordance With Rx buffer alignment (typically 1024B). So, for example, in order to successfully receive and capture a 1500B packet, mbufs with a data portion of size 2048B must be used. - in OvS, the elements that comprise an mbuf are: * the dp packet, which includes a struct rte mbuf (704B) * RTE_PKTMBUF_HEADROOM (128B) * packet data (aligned to 1k, as previously described) * RTE_PKTMBUF_TAILROOM (typically 0) Some PMDs require that the total mbuf size (i.e. the total sum of all of the above-listed components' lengths) is cache-aligned. To satisfy this requirement, it may be necessary to round up the total mbuf size with respect to cacheline size. In doing so, it's possible that the dp_packet's data portion is inadvertently increased in size, such that it no longer adheres to Rx buffer alignment. Consequently, the following property of the mbuf no longer holds true: mbuf.data_len == mbuf.buf_len - mbuf.data_off This creates a problem in the case of multi-segment mbufs, where that assumption is assumed to be true for all but the final segment in an mbuf chain. Resolve this issue by adjusting the size of the mbuf's private data portion, as opposed to the packet data portion when aligning mbuf size to cachelines. Co-authored-by: Tiago Lam <tiago.lam@intel.com> Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization") Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size") CC: Santosh Shukla <santosh.shukla@caviumnetworks.com> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Tiago Lam <tiago.lam@intel.com> Acked-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-11-02 09:06:32 +00:00
VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
"on socket %d for %d Rx and %d Tx queues, "
"cache line size of %u",
netdev_name, n_mbufs, mbuf_size, socket_id,
dev->requested_n_rxq, dev->requested_n_txq,
RTE_CACHE_LINE_SIZE);
/* The size of the mbuf's private area (i.e. area that holds OvS'
* dp_packet data). */
netdev-dpdk: fix mbuf sizing There are numerous factors that must be considered when calculating the size of an mbuf: - the data portion of the mbuf must be sized in accordance With Rx buffer alignment (typically 1024B). So, for example, in order to successfully receive and capture a 1500B packet, mbufs with a data portion of size 2048B must be used. - in OvS, the elements that comprise an mbuf are: * the dp packet, which includes a struct rte mbuf (704B) * RTE_PKTMBUF_HEADROOM (128B) * packet data (aligned to 1k, as previously described) * RTE_PKTMBUF_TAILROOM (typically 0) Some PMDs require that the total mbuf size (i.e. the total sum of all of the above-listed components' lengths) is cache-aligned. To satisfy this requirement, it may be necessary to round up the total mbuf size with respect to cacheline size. In doing so, it's possible that the dp_packet's data portion is inadvertently increased in size, such that it no longer adheres to Rx buffer alignment. Consequently, the following property of the mbuf no longer holds true: mbuf.data_len == mbuf.buf_len - mbuf.data_off This creates a problem in the case of multi-segment mbufs, where that assumption is assumed to be true for all but the final segment in an mbuf chain. Resolve this issue by adjusting the size of the mbuf's private data portion, as opposed to the packet data portion when aligning mbuf size to cachelines. Co-authored-by: Tiago Lam <tiago.lam@intel.com> Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization") Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size") CC: Santosh Shukla <santosh.shukla@caviumnetworks.com> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Tiago Lam <tiago.lam@intel.com> Acked-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-11-02 09:06:32 +00:00
mbuf_priv_data_len = sizeof(struct dp_packet) -
sizeof(struct rte_mbuf);
/* The size of the entire dp_packet. */
pkt_size = sizeof(struct dp_packet) + mbuf_size;
/* mbuf size, rounded up to cacheline size. */
aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE);
/* If there is a size discrepancy, add padding to mbuf_priv_data_len.
* This maintains mbuf size cache alignment, while also honoring RX
* buffer alignment in the data portion of the mbuf. If this adjustment
* is not made, there is a possibility later on that for an element of
* the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
* This is problematic in the case of multi-segment mbufs, particularly
* when an mbuf segment needs to be resized (when [push|popp]ing a VLAN
* header, for example).
*/
mbuf_priv_data_len += (aligned_mbuf_size - pkt_size);
dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
mbuf_priv_data_len,
mbuf_size,
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
socket_id);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
if (dmp->mp) {
VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
mp_name, n_mbufs);
/* rte_pktmbuf_pool_create has done some initialization of the
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
* rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
* initializes some OVS specific fields of dp_packet.
*/
rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
return dmp;
} else if (rte_errno == EEXIST) {
/* A mempool with the same name already exists. We just
* retrieve its pointer to be returned to the caller. */
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
dmp->mp = rte_mempool_lookup(mp_name);
/* As the mempool create returned EEXIST we can expect the
* lookup has returned a valid pointer. If for some reason
* that's not the case we keep track of it. */
VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
mp_name, dmp->mp);
return dmp;
} else {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
VLOG_DBG("Failed to create mempool \"%s\" with a request of "
"%u mbufs, retrying with %u mbufs",
mp_name, n_mbufs, n_mbufs / 2);
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
} while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
mp_name, n_mbufs);
rte_free(dmp);
return NULL;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
static struct dpdk_mp *
dpdk_mp_get(struct netdev_dpdk *dev, int mtu)
{
struct dpdk_mp *dmp = NULL, *next;
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
bool reuse = false;
ovs_mutex_lock(&dpdk_mp_mutex);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Check if shared memory is being used, if so check existing mempools
* to see if reuse is possible. */
if (!per_port_memory) {
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has it's MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
/* If user has provided defined mempools, check if one is suitable
* and get new buffer size.*/
mtu = dpdk_get_user_adjusted_mtu(mtu, dev->requested_mtu,
dev->requested_socket_id);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
if (dmp->socket_id == dev->requested_socket_id
&& dmp->mtu == mtu) {
VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
dmp->refcount++;
reuse = true;
break;
}
}
}
/* Sweep mempools after reuse or before create. */
dpdk_mp_sweep();
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
if (!reuse) {
dmp = dpdk_mp_create(dev, mtu);
if (dmp) {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Shared memory will hit the reuse case above so will not
* request a mempool that already exists but we need to check
* for the EEXIST case for per port memory case. Compare the
* mempool returned by dmp to each entry in dpdk_mp_list. If a
* match is found, free dmp as a new entry is not required, set
* dmp to point to the existing entry and increment the refcount
* to avoid being freed at a later stage.
*/
if (per_port_memory && rte_errno == EEXIST) {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
if (dmp->mp == next->mp) {
rte_free(dmp);
dmp = next;
dmp->refcount++;
}
}
} else {
ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
}
}
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
ovs_mutex_unlock(&dpdk_mp_mutex);
return dmp;
}
/* Drops one reference on 'dmp' while holding 'dpdk_mp_mutex'.
 *
 * A NULL 'dmp' is accepted and ignored.  The reference count must be
 * non-zero on entry (asserted).  This function only decrements the counter;
 * it does not free the mempool itself. */
static void
dpdk_mp_put(struct dpdk_mp *dmp)
{
    if (dmp) {
        ovs_mutex_lock(&dpdk_mp_mutex);
        /* Catch unbalanced put calls before the counter wraps around. */
        ovs_assert(dmp->refcount);
        dmp->refcount -= 1;
        ovs_mutex_unlock(&dpdk_mp_mutex);
    }
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Depending on the memory model being used this function tries to
* identify and reuse an existing mempool or tries to allocate a new
* mempool on requested_socket_id with mbuf size corresponding to the
* requested_mtu. On success, a new configuration will be applied.
* On error, device will be left unchanged. */
static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
struct dpdk_mp *dmp;
int ret = 0;
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* With shared memory we do not need to configure a mempool if the MTU
* and socket ID have not changed, the previous configuration is still
* valid so return 0 */
if (!per_port_memory && dev->mtu == dev->requested_mtu
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
&& dev->socket_id == dev->requested_socket_id) {
return ret;
}
dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size));
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
if (!dmp) {
VLOG_ERR("Failed to create memory pool for netdev "
"%s, with MTU %d on socket %d: %s\n",
dev->up.name, dev->requested_mtu, dev->requested_socket_id,
rte_strerror(rte_errno));
ret = rte_errno;
} else {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Check for any pre-existing dpdk_mp for the device before accessing
* the associated mempool.
*/
if (dev->dpdk_mp != NULL) {
/* A new MTU was requested, decrement the reference count for the
* devices current dpdk_mp. This is required even if a pointer to
* same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
* has already been incremented by dpdk_mp_get at this stage so it
* must be decremented to keep an accurate refcount for the
* dpdk_mp.
*/
dpdk_mp_put(dev->dpdk_mp);
}
dev->dpdk_mp = dmp;
dev->mtu = dev->requested_mtu;
dev->socket_id = dev->requested_socket_id;
dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
}
return ret;
}
/* Polls the link state of 'dev' from DPDK without waiting for completion.
 *
 * When the reported up/down status differs from the cached 'dev->link', the
 * netdev change sequence is bumped, 'dev->link_reset_cnt' is incremented, the
 * cached link is replaced and the transition is logged at debug level.
 *
 * If the link state cannot be retrieved, the cached state is left untouched:
 * 'link' would otherwise be read uninitialized. */
static void
check_link_status(struct netdev_dpdk *dev)
{
    struct rte_eth_link link;

    if (rte_eth_link_get_nowait(dev->port_id, &link) < 0) {
        VLOG_DBG_RL(&rl,
                    "Failed to retrieve link status for port "DPDK_PORT_ID_FMT,
                    dev->port_id);
        return;
    }

    if (dev->link.link_status != link.link_status) {
        netdev_change_seq_changed(&dev->up);

        dev->link_reset_cnt++;
        dev->link = link;
        if (dev->link.link_status) {
            VLOG_DBG_RL(&rl,
                        "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
                        dev->port_id, (unsigned) dev->link.link_speed,
                        (dev->link.link_duplex == RTE_ETH_LINK_FULL_DUPLEX)
                        ? "full-duplex" : "half-duplex");
        } else {
            VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
                        dev->port_id);
        }
    }
}
/* Thread body: detaches itself, then loops forever.  On every iteration it
 * walks 'dpdk_list' under 'dpdk_mutex', takes each device's own mutex, and
 * refreshes the link status of devices whose type is DPDK_DEV_ETH.  It then
 * sleeps for DPDK_PORT_WATCHDOG_INTERVAL before the next pass. */
static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *port;

    pthread_detach(pthread_self());

    while (true) {
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (port, list_node, &dpdk_list) {
            /* The device mutex is held while its type is inspected and
             * while its link state is updated. */
            ovs_mutex_lock(&port->mutex);
            if (port->type == DPDK_DEV_ETH) {
                check_link_status(port);
            }
            ovs_mutex_unlock(&port->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);

        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
    }

    /* Not reached; present only to satisfy the return type. */
    return NULL;
}
/* Mirrors one hardware offload capability into the generic netdev flags:
 * 'flag' is set in 'dev->up.ol_flags' when any bit of 'hw_ol_features' is
 * present in 'dev->hw_ol_features', and cleared otherwise. */
static void
netdev_dpdk_update_netdev_flag(struct netdev_dpdk *dev,
                               enum dpdk_hw_ol_features hw_ol_features,
                               enum netdev_ol_flags flag)
    OVS_REQUIRES(dev->mutex)
{
    if (!(dev->hw_ol_features & hw_ol_features)) {
        dev->up.ol_flags &= ~flag;
        return;
    }

    dev->up.ol_flags |= flag;
}
/* Synchronizes every Tx offload flag of 'dev->up' with the corresponding
 * bit in 'dev->hw_ol_features', one (hardware feature, netdev flag) pair at
 * a time, in the order listed in the table below. */
static void
netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    static const struct {
        enum dpdk_hw_ol_features hw_feature;
        enum netdev_ol_flags netdev_flag;
    } offload_map[] = {
        { NETDEV_TX_IPV4_CKSUM_OFFLOAD, NETDEV_TX_OFFLOAD_IPV4_CKSUM },
        { NETDEV_TX_TCP_CKSUM_OFFLOAD, NETDEV_TX_OFFLOAD_TCP_CKSUM },
        { NETDEV_TX_UDP_CKSUM_OFFLOAD, NETDEV_TX_OFFLOAD_UDP_CKSUM },
        { NETDEV_TX_SCTP_CKSUM_OFFLOAD, NETDEV_TX_OFFLOAD_SCTP_CKSUM },
        { NETDEV_TX_TSO_OFFLOAD, NETDEV_TX_OFFLOAD_TCP_TSO },
        { NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD, NETDEV_TX_VXLAN_TNL_TSO },
        { NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD, NETDEV_TX_GENEVE_TNL_TSO },
        { NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
          NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM },
        { NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
          NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM },
    };
    size_t idx;

    for (idx = 0; idx < sizeof offload_map / sizeof offload_map[0]; idx++) {
        netdev_dpdk_update_netdev_flag(dev, offload_map[idx].hw_feature,
                                       offload_map[idx].netdev_flag);
    }
}
static int
dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
{
int diag = 0;
int i;
struct rte_eth_conf conf = port_conf;
struct rte_eth_dev_info info;
uint16_t conf_mtu;
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
rte_eth_dev_info_get(dev->port_id, &info);
/* As of DPDK 17.11.1 a few PMDs require to explicitly enable
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
* scatter to support jumbo RX.
* Setting scatter for the device is done after checking for
* scatter support in the device capabilites. */
if (dev->mtu > RTE_ETHER_MTU) {
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_SCATTER;
}
}
conf.intr_conf.lsc = dev->lsc_interrupt_mode;
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_CHECKSUM;
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
}
if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
&& info.rx_offload_capa & RTE_ETH_RX_OFFLOAD_KEEP_CRC) {
conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_KEEP_CRC;
}
if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_TCP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_UDP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_SCTP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
}
if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
}
if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
}
if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
}
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
/* Limit configured rss hash functions to only those supported
* by the eth device. */
conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
if (conf.rx_adv_conf.rss_conf.rss_hf == 0) {
conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;
} else {
conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
}
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
/* A device may report more queues than it makes available (this has
* been observed for Intel xl710, which reserves some of them for
* SRIOV): rte_eth_*_queue_setup will fail if a queue is not
* available. When this happens we can retry the configuration
* and request less queues */
while (n_rxq && n_txq) {
if (diag) {
VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
}
diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
if (diag) {
VLOG_WARN("Interface %s eth_dev setup error %s\n",
dev->up.name, rte_strerror(-diag));
break;
}
diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
if (diag) {
/* A device may not support rte_eth_dev_set_mtu, in this case
* flag a warning to the user and include the devices configured
* MTU value that will be used instead. */
if (-ENOTSUP == diag) {
rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
VLOG_WARN("Interface %s does not support MTU configuration, "
"max packet size supported is %"PRIu16".",
dev->up.name, conf_mtu);
} else {
VLOG_ERR("Interface %s MTU (%d) setup error: %s",
dev->up.name, dev->mtu, rte_strerror(-diag));
break;
}
}
for (i = 0; i < n_txq; i++) {
diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
dev->socket_id, NULL);
if (diag) {
VLOG_INFO("Interface %s unable to setup txq(%d): %s",
dev->up.name, i, rte_strerror(-diag));
break;
}
}
if (i != n_txq) {
/* Retry with less tx queues */
n_txq = i;
continue;
}
for (i = 0; i < n_rxq; i++) {
diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
dev->socket_id, NULL,
dev->dpdk_mp->mp);
if (diag) {
VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
dev->up.name, i, rte_strerror(-diag));
break;
}
}
if (i != n_rxq) {
/* Retry with less rx queues */
n_rxq = i;
continue;
}
dev->up.n_rxq = n_rxq;
dev->up.n_txq = n_txq;
return 0;
}
return diag;
}
/* Pushes the flow control configuration stored in 'dev->fc_conf' to the
 * DPDK port 'dev->port_id'.  A failure is only reported with a warning;
 * nothing is returned to the caller. */
static void
dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
{
    int err = rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf);

    if (err) {
        VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }
}
/* Asks the driver of 'dev' to deliver per-packet Rx metadata (USER_MARK and,
 * when the experimental API is compiled in, TUNNEL_ID).
 *
 * The negotiation is attempted at most once per device: whatever the outcome,
 * 'dev->rx_metadata_delivery_configured' is set so later calls return
 * immediately. */
static void
dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev)
{
    uint64_t features;
    int err;

    if (dev->rx_metadata_delivery_configured) {
        return;
    }

    /* USER_MARK is for the fallback offload (non-"transfer" rules). */
    features = RTE_ETH_RX_METADATA_USER_MARK;
#ifdef ALLOW_EXPERIMENTAL_API
    /* TUNNEL_ID is for the tunnel offload. */
    features |= RTE_ETH_RX_METADATA_TUNNEL_ID;
#endif /* ALLOW_EXPERIMENTAL_API */

    /* On return 'features' holds what the driver actually agreed to. */
    err = rte_eth_rx_metadata_negotiate(dev->port_id, &features);
    if (err) {
        /* Lack of support is expected on some NICs, so it is logged at debug
         * level only; any other error deserves a warning. */
        VLOG(err == -ENOTSUP ? VLL_DBG : VLL_WARN,
             "%s: Cannot negotiate Rx metadata: %s",
             netdev_get_name(&dev->up), rte_strerror(-err));
    } else {
        if ((features & RTE_ETH_RX_METADATA_USER_MARK) == 0) {
            VLOG_DBG("%s: The NIC will not provide per-packet USER_MARK",
                     netdev_get_name(&dev->up));
        }
#ifdef ALLOW_EXPERIMENTAL_API
        if ((features & RTE_ETH_RX_METADATA_TUNNEL_ID) == 0) {
            VLOG_DBG("%s: The NIC will not provide per-packet TUNNEL_ID",
                     netdev_get_name(&dev->up));
        }
#endif /* ALLOW_EXPERIMENTAL_API */
    }

    dev->rx_metadata_delivery_configured = true;
}
static int
dpdk_eth_dev_init(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
struct rte_pktmbuf_pool_private *mbp_priv;
struct rte_eth_dev_info info;
struct rte_ether_addr eth_addr;
int diag;
int n_rxq, n_txq;
uint32_t rx_chksm_offload_capa = RTE_ETH_RX_OFFLOAD_UDP_CKSUM |
RTE_ETH_RX_OFFLOAD_TCP_CKSUM |
RTE_ETH_RX_OFFLOAD_IPV4_CKSUM;
if (netdev_is_flow_api_enabled()) {
/*
* Full tunnel offload requires that tunnel ID metadata be
* delivered with "miss" packets from the hardware to the
* PMD. The same goes for megaflow mark metadata which is
* used in MARK + RSS offload scenario.
*
* Request delivery of such metadata.
*/
dpdk_eth_dev_init_rx_metadata(dev);
}
rte_eth_dev_info_get(dev->port_id, &info);
if (strstr(info.driver_name, "vf") != NULL) {
VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
} else {
dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
}
if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
rx_chksm_offload_capa) {
VLOG_WARN("Rx checksum offload is not supported on port "
DPDK_PORT_ID_FMT, dev->port_id);
dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
} else {
dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
}
if (info.rx_offload_capa & RTE_ETH_RX_OFFLOAD_SCATTER) {
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
} else {
/* Do not warn on lack of scatter support */
dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
}
if (!strcmp(info.driver_name, "net_tap")) {
/* FIXME: L4 checksum offloading is broken in DPDK net/tap driver.
* This workaround can be removed once the fix makes it to a DPDK
* LTS release used by OVS. */
VLOG_INFO("%s: disabled Tx L4 checksum offloads for a net/tap port.",
netdev_get_name(&dev->up));
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_UDP_CKSUM;
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
}
if (!strcmp(info.driver_name, "net_ice")
|| !strcmp(info.driver_name, "net_i40e")
|| !strcmp(info.driver_name, "net_iavf")) {
/* FIXME: Driver advertises the capability but doesn't seem
* to actually support it correctly. Can remove this once
* the driver is fixed on DPDK side. */
VLOG_INFO("%s: disabled Tx outer udp checksum offloads for a "
"net/ice, net/i40e or net/iavf port.",
netdev_get_name(&dev->up));
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_TCP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_UDP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_UDP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
}
dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
if (userspace_tso_enabled()) {
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) {
dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
} else {
VLOG_WARN("%s: Tx TSO offload is not supported.",
netdev_get_name(&dev->up));
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
} else {
VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
netdev_get_name(&dev->up));
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
} else {
VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
netdev_get_name(&dev->up));
}
}
n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
if (diag) {
VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
"configure error: %s",
dev->up.name, n_rxq, n_txq,
dev->lsc_interrupt_mode ? "true" : "false",
rte_strerror(-diag));
return -diag;
}
diag = rte_eth_dev_start(dev->port_id);
if (diag) {
VLOG_ERR("Interface %s start error: %s", dev->up.name,
rte_strerror(-diag));
return -diag;
}
dev->started = true;
netdev_dpdk_configure_xstats(dev);
rte_eth_promiscuous_enable(dev->port_id);
rte_eth_allmulticast_enable(dev->port_id);
memset(&eth_addr, 0x0, sizeof(eth_addr));
rte_eth_macaddr_get(dev->port_id, &eth_addr);
VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
rte_eth_link_get_nowait(dev->port_id, &dev->link);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
return 0;
}
/* Converts a generic 'netdev' pointer into the 'struct netdev_dpdk' that
 * embeds it as its 'up' member.  No type check is performed here. */
static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = CONTAINER_OF(netdev, struct netdev_dpdk, up);

    return dev;
}
/* Allocates a 'struct netdev_dpdk' with dpdk_rte_mzalloc() and returns a
 * pointer to its embedded generic netdev, or NULL if allocation fails. */
static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *dev = dpdk_rte_mzalloc(sizeof *dev);

    return dev ? &dev->up : NULL;
}
static struct dpdk_tx_queue *
netdev_dpdk_alloc_txq(unsigned int n_txqs)
{
struct dpdk_tx_queue *txqs;
unsigned i;
txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
if (txqs) {
for (i = 0; i < n_txqs; i++) {
/* Initialize map for vhost devices. */
txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
rte_spinlock_init(&txqs[i].tx_lock);
}
}
return txqs;
}
/* Sets up the state common to every DPDK-backed netdev: locks, NUMA socket,
 * port id and type, default requested MTU/queue/ring sizes, and cleared
 * offload, policer, rx-steering and xstats state.
 *
 * The device is appended to 'dpdk_list' and a reconfiguration is requested,
 * so the requested_* defaults written here are only applied later.
 *
 * Always returns 0. */
static int
common_construct(struct netdev *netdev, dpdk_port_t port_no,
                 enum dpdk_dev_type type, int socket_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    /* A negative 'socket_id' means that the kernel failed to obtain the PCI
     * NUMA info.  In that situation, always use 'SOCKET0'. */
    int sid = socket_id < 0 ? SOCKET0 : socket_id;

    /* Locks. */
    ovs_mutex_init(&dev->mutex);
    rte_spinlock_init(&dev->stats_lock);

    /* Identity and placement. */
    dev->socket_id = sid;
    dev->requested_socket_id = dev->socket_id;
    dev->port_id = port_no;
    dev->type = type;
    dev->flags = 0;

    /* MTU and link defaults. */
    dev->requested_mtu = RTE_ETHER_MTU;
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    dev->requested_lsc_interrupt_mode = 0;

    /* vhost state: no virtio device id is associated yet. */
    ovsrcu_index_init(&dev->vid, -1);
    dev->vhost_reconfigured = false;
    dev->virtio_features_state = OVS_VIRTIO_F_CLEAN;

    /* Lifecycle flags. */
    dev->attached = false;
    dev->started = false;
    dev->reset_needed = false;

    /* No QoS or ingress policing configured. */
    ovsrcu_init(&dev->qos_conf, NULL);
    ovsrcu_init(&dev->ingress_policer, NULL);
    dev->policer_rate = 0;
    dev->policer_burst = 0;

    /* No queues exist yet; the requested values below take effect on the
     * reconfiguration requested further down. */
    netdev->n_rxq = 0;
    netdev->n_txq = 0;
    dev->user_n_rxq = NR_QUEUE;
    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_n_txq = NR_QUEUE;
    dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
    dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;

    /* Custom rx-steering is off. */
    dev->requested_rx_steer_flags = 0;
    dev->rx_steer_flags = 0;
    dev->rx_steer_flows_num = 0;
    dev->rx_steer_flows = NULL;

    /* Start from an all-zero flow control configuration. */
    memset(&dev->fc_conf, 0, sizeof dev->fc_conf);

    /* Start with no hardware offload features. */
    dev->hw_ol_features = 0;
    dev->rx_metadata_delivery_configured = false;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;

    ovs_list_push_back(&dpdk_list, &dev->list_node);

    netdev_request_reconfigure(netdev);

    /* Extended statistics bookkeeping starts empty. */
    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;
    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;

    /* Only vhost devices start 'tx_retries' at zero; every other type gets
     * UINT64_MAX. */
    dev->sw_stats = xzalloc(sizeof *dev->sw_stats);
    if (dev->type == DPDK_DEV_VHOST) {
        dev->sw_stats->tx_retries = 0;
    } else {
        dev->sw_stats->tx_retries = UINT64_MAX;
    }

    return 0;
}
static int
vhost_common_construct(struct netdev *netdev)
OVS_REQUIRES(dpdk_mutex)
{
dpdk: Update to use DPDK v20.11. This commit adds support for DPDK v20.11, it includes the following changes. 1. travis: Remove explicit DPDK kmods configuration. 2. sparse: Fix build with 20.05 DPDK tracepoints. 3. netdev-dpdk: Remove experimental API flag. http://patchwork.ozlabs.org/project/openvswitch/list/?series=173216&state=* 4. sparse: Update to DPDK 20.05 trace point header. http://patchwork.ozlabs.org/project/openvswitch/list/?series=179604&state=* 5. sparse: Fix build with DPDK 20.08. http://patchwork.ozlabs.org/project/openvswitch/list/?series=200181&state=* 6. build: Add support for DPDK meson build. http://patchwork.ozlabs.org/project/openvswitch/list/?series=199138&state=* 7. netdev-dpdk: Remove usage of RTE_ETH_DEV_CLOSE_REMOVE flag. http://patchwork.ozlabs.org/project/openvswitch/list/?series=207850&state=* 8. netdev-dpdk: Fix build with 20.11-rc1. http://patchwork.ozlabs.org/project/openvswitch/list/?series=209006&state=* 9. sparse: Fix __ATOMIC_* redefinition errors http://patchwork.ozlabs.org/project/openvswitch/list/?series=209452&state=* 10. build: Remove DPDK make build references. http://patchwork.ozlabs.org/project/openvswitch/list/?series=216682&state=* For credit all authors of the original commits to 'dpdk-latest' with the above changes have been added as co-authors for this commit. Signed-off-by: David Marchand <david.marchand@redhat.com> Co-authored-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Sunil Pai G <sunil.pai.g@intel.com> Co-authored-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Eli Britstein <elibr@nvidia.com> Co-authored-by: Eli Britstein <elibr@nvidia.com> Tested-by: Harry van Haaren <harry.van.haaren@intel.com> Tested-by: Govindharajan, Hariprasad <hariprasad.govindharajan@intel.com> Tested-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> Acked-by: Ilya Maximets <i.maximets@ovn.org> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-12-15 16:41:28 +00:00
int socket_id = rte_lcore_to_socket_id(rte_get_main_lcore());
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
sizeof *dev->vhost_rxq_enabled);
if (!dev->vhost_rxq_enabled) {
return ENOMEM;
}
dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
if (!dev->tx_q) {
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
rte_free(dev->vhost_rxq_enabled);
return ENOMEM;
}
atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
DPDK_DEV_VHOST, socket_id);
}
static int
netdev_dpdk_vhost_construct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const char *name = netdev->name;
int err;
/* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
* the file system. '/' or '\' would traverse directories, so they're not
* acceptable in 'name'. */
if (strchr(name, '/') || strchr(name, '\\')) {
VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
"A valid name must not include '/' or '\\'",
name);
return EINVAL;
}
ovs_mutex_lock(&dpdk_mutex);
/* Take the name of the vhost-user port and append it to the location where
* the socket is to be created, then register the socket.
*/
dev->vhost_id = xasprintf("%s/%s", vhost_sock_dir, name);
dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
/* There is no support for multi-segments buffers. */
dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
if (err) {
VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
dev->vhost_id);
goto out;
} else {
fatal_signal_add_file_to_unlink(dev->vhost_id);
VLOG_INFO("Socket %s created for vhost-user port %s\n",
dev->vhost_id, name);
}
err = rte_vhost_driver_callback_register(dev->vhost_id,
&virtio_net_device_ops);
if (err) {
VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
"port: %s\n", name);
goto out;
}
if (!userspace_tso_enabled()) {
err = rte_vhost_driver_disable_features(dev->vhost_id,
1ULL << VIRTIO_NET_F_HOST_TSO4
| 1ULL << VIRTIO_NET_F_HOST_TSO6
| 1ULL << VIRTIO_NET_F_CSUM);
if (err) {
VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
"port: %s\n", name);
goto out;
}
}
err = rte_vhost_driver_start(dev->vhost_id);
if (err) {
VLOG_ERR("rte_vhost_driver_start failed for vhost user "
"port: %s\n", name);
goto out;
}
err = vhost_common_construct(netdev);
if (err) {
VLOG_ERR("vhost_common_construct failed for vhost user "
"port: %s\n", name);
}
out:
if (err) {
free(dev->vhost_id);
dev->vhost_id = NULL;
}
ovs_mutex_unlock(&dpdk_mutex);
VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
"please migrate to dpdkvhostuserclient ports.");
return err;
}
/* Constructs a vhost-user client netdev.
 *
 * Only vhost_common_construct() is run here, with 'dpdk_mutex' held; unlike
 * netdev_dpdk_vhost_construct(), no socket path is built and no vhost driver
 * is registered in this function.
 *
 * Returns 0 on success, otherwise the error code returned by
 * vhost_common_construct() (e.g. ENOMEM). */
static int
netdev_dpdk_vhost_client_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = vhost_common_construct(netdev);
    if (err) {
        /* Adjacent string literals are concatenated as-is, so the first one
         * must end with a space to keep "client" and "port" apart. */
        VLOG_ERR("vhost_common_construct failed for vhost user client "
                 "port: %s\n", netdev->name);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}
static int
netdev_dpdk_construct(struct netdev *netdev)
{
int err;
ovs_mutex_lock(&dpdk_mutex);
err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
DPDK_DEV_ETH, SOCKET0);
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
/* Releases the per-device resources owned by 'dev': its Tx queue array, its
 * mempool, its membership in the device list, its ingress policer, its
 * software stats and finally its mutex.
 *
 * Per the annotations below, the caller must hold 'dpdk_mutex' and must not
 * hold 'dev->mutex' (which is destroyed here). */
static void
common_destruct(struct netdev_dpdk *dev)
OVS_REQUIRES(dpdk_mutex)
OVS_EXCLUDED(dev->mutex)
{
/* The Tx queue array is released with rte_free(), not free(). */
rte_free(dev->tx_q);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* NOTE(review): presumably drops this device's reference on its mempool
 * rather than freeing it unconditionally -- confirm against dpdk_mp_put(). */
dpdk_mp_put(dev->dpdk_mp);
/* Unlink the device from the list it sits on through 'list_node' (the same
 * member used to walk 'dpdk_list'). */
ovs_list_remove(&dev->list_node);
/* The policer pointer is read with ovsrcu_get_protected() and freed right
 * away.  NOTE(review): this assumes no reader can still reach the device at
 * this point -- confirm. */
free(ovsrcu_get_protected(struct ingress_policer *,
&dev->ingress_policer));
free(dev->sw_stats);
/* Destroyed last; nothing after this point may touch 'dev->mutex'. */
ovs_mutex_destroy(&dev->mutex);
}
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
static void dpdk_rx_steer_unconfigure(struct netdev_dpdk *);
static void
netdev_dpdk_destruct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dpdk_mutex);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
/* Destroy any rx-steering flows to allow RXQs to be removed. */
dpdk_rx_steer_unconfigure(dev);
rte_eth_dev_stop(dev->port_id);
dev->started = false;
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation until we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
if (dev->attached) {
bool dpdk_resources_still_used = false;
struct rte_eth_dev_info dev_info;
dpdk_port_t sibling_port_id;
/* Check if this netdev has siblings (i.e. shares DPDK resources) among
* other OVS netdevs. */
RTE_ETH_FOREACH_DEV_SIBLING (sibling_port_id, dev->port_id) {
struct netdev_dpdk *sibling;
/* RTE_ETH_FOREACH_DEV_SIBLING lists dev->port_id as part of the
* loop. */
if (sibling_port_id == dev->port_id) {
continue;
}
LIST_FOR_EACH (sibling, list_node, &dpdk_list) {
if (sibling->port_id != sibling_port_id) {
continue;
}
dpdk_resources_still_used = true;
break;
}
if (dpdk_resources_still_used) {
break;
}
}
/* Retrieve eth device data before closing it. */
rte_eth_dev_info_get(dev->port_id, &dev_info);
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
/* Remove the eth device. */
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation until we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
rte_eth_dev_close(dev->port_id);
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
/* Remove the rte device if no associated eth device is used by OVS.
* Note: any remaining eth devices associated to this rte device are
* closed by DPDK ethdev layer. */
if (!dpdk_resources_still_used) {
int ret = rte_dev_remove(dev_info.device);
if (ret < 0) {
VLOG_ERR("Device '%s' can not be detached: %s.",
dev->devargs, rte_strerror(-ret));
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
} else {
/* Device was closed and detached. */
VLOG_INFO("Device '%s' has been removed and detached",
dev->devargs);
}
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation untill we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
} else {
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
/* Device was only closed. rte_dev_remove() was not called. */
VLOG_INFO("Device '%s' has been removed", dev->devargs);
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation untill we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
}
}
netdev_dpdk_clear_xstats(dev);
free(dev->devargs);
common_destruct(dev);
ovs_mutex_unlock(&dpdk_mutex);
}
/* Unregisters the vhost-user driver bound to the socket path 'vhost_id' and
 * hands back the status reported by rte_vhost_driver_unregister().
 *
 * Locking: unregistering may call back into destroy_device(), which takes
 * 'dpdk_mutex' and possibly 'dev->mutex'.  The caller therefore must hold
 * neither of them, otherwise a deadlock is possible.  'dev' is only used for
 * the thread-safety annotations. */
static int
dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
                             char *vhost_id)
    OVS_EXCLUDED(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    int status = rte_vhost_driver_unregister(vhost_id);

    return status;
}
/* Destroys the vhost-user netdev 'netdev'.
 *
 * While holding 'dpdk_mutex': logs an error if a guest is still attached and
 * the port is not in vhost-user client mode, detaches the socket path from
 * 'dev', frees the per-queue enable array and runs common_destruct().
 *
 * After releasing 'dpdk_mutex': unregisters the vhost driver for the socket
 * and, when the port is not in client mode, removes the socket from the
 * fatal-signal unlink list.  The socket path string is freed on every
 * path. */
static void
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
char *vhost_id;
ovs_mutex_lock(&dpdk_mutex);
/* Guest becomes an orphan if still attached. */
if (netdev_dpdk_get_vid(dev) >= 0
&& !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
VLOG_ERR("Removing port '%s' while vhost device still attached.",
netdev->name);
VLOG_ERR("To restore connectivity after re-adding of port, VM on "
"socket '%s' must be restarted.", dev->vhost_id);
}
/* Move the socket path out of 'dev' into a local so that it can still be
 * used (and finally freed) after 'dpdk_mutex' has been dropped. */
vhost_id = dev->vhost_id;
dev->vhost_id = NULL;
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
rte_free(dev->vhost_rxq_enabled);
common_destruct(dev);
/* dpdk_vhost_driver_unregister() is annotated OVS_EXCLUDED(dpdk_mutex), so
 * the lock has to be released before it is called below. */
ovs_mutex_unlock(&dpdk_mutex);
/* No socket path was set on this device: nothing to unregister. */
if (!vhost_id) {
goto out;
}
if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
netdev->name, vhost_id);
} else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
/* OVS server mode - remove this socket from list for deletion */
fatal_signal_remove_file_to_unlink(vhost_id);
}
out:
/* free(NULL) is a no-op, so this also covers the "no socket path" case. */
free(vhost_id);
}
/* Releases the memory of the 'struct netdev_dpdk' that embeds 'netdev' by
 * passing it to rte_free().  (Presumably the counterpart of an rte-based
 * allocation in the matching alloc callback -- confirm against that
 * callback.) */
static void
netdev_dpdk_dealloc(struct netdev *netdev)
{
    rte_free(netdev_dpdk_cast(netdev));
}
/* Drops the cached extended-statistics metadata of 'dev': frees both the
 * names array and the ids array, then resets the pointers to NULL and the
 * element counts to zero so that the cache reads as empty. */
static void
netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    free(dev->rte_xstats_ids);
    free(dev->rte_xstats_names);

    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;

    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;
}
/* Returns the cached name of the extended statistic with index 'id', or the
 * literal "UNKNOWN" when 'id' is not below the number of cached names.  The
 * returned string is owned by 'dev' (or is a string literal); the caller must
 * not free it. */
static const char *
netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
    OVS_REQUIRES(dev->mutex)
{
    return id < dev->rte_xstats_names_size
           ? dev->rte_xstats_names[id].name
           : "UNKNOWN";
}
/* Returns true if 's' names a basic per-queue counter, i.e. it has the form
 * "rx_q<N>_packets", "rx_q<N>_bytes", "tx_q<N>_packets" or "tx_q<N>_bytes",
 * where <N> is scanned as an unsigned 16-bit queue number. */
static bool
is_queue_stat(const char *s)
{
    uint16_t queue_id;

    /* Only names starting with 'r' or 't' can be rx/tx queue counters. */
    if (s[0] != 'r' && s[0] != 't') {
        return false;
    }

    if (ovs_scan(s + 1, "x_q%"SCNu16"_packets", &queue_id)) {
        return true;
    }
    return ovs_scan(s + 1, "x_q%"SCNu16"_bytes", &queue_id);
}
/* Rebuilds the extended-statistics cache of 'dev'.
 *
 * The previous cache is always discarded first.  The names of all xstats of
 * 'dev->port_id' are then fetched and stored in 'dev', and the ids of the
 * counters of interest (per-queue basic stats, "*_errors", "*_management_*"
 * and "*_dropped") are collected in 'dev->rte_xstats_ids'.
 *
 * If any DPDK query fails or returns an unexpected number of entries, a
 * warning is logged and the cache is left empty. */
static void
netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_eth_xstat_name *names = NULL;
    struct rte_eth_xstat *values = NULL;
    int n_names;
    int n_read;

    netdev_dpdk_clear_xstats(dev);

    /* First query without a buffer: the result is used as the number of
     * entries to allocate for. */
    n_names = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
    if (n_names < 0) {
        VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }

    names = xcalloc(n_names, sizeof *names);
    n_read = rte_eth_xstats_get_names(dev->port_id, names, n_names);
    if (n_read < 0 || n_read != n_names) {
        VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }

    /* The values themselves are only needed here for their 'id' fields. */
    values = xcalloc(n_names, sizeof *values);
    n_read = rte_eth_xstats_get(dev->port_id, values, n_names);
    if (n_read < 0 || n_read != n_names) {
        VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }

    /* Ownership of the names array moves to 'dev'; clearing the local
     * pointer keeps the cleanup below from freeing it. */
    dev->rte_xstats_names = names;
    names = NULL;
    dev->rte_xstats_names_size = n_names;

    dev->rte_xstats_ids = xcalloc(n_names, sizeof *dev->rte_xstats_ids);
    for (unsigned int i = 0; i < n_names; i++) {
        uint64_t id = values[i].id;
        const char *name = netdev_dpdk_get_xstat_name(dev, id);

        /* For custom stats, everything is filtered out except per rxq/txq
         * basic stats, and dropped, error and management counters. */
        bool wanted = is_queue_stat(name)
                      || string_ends_with(name, "_errors")
                      || strstr(name, "_management_")
                      || string_ends_with(name, "_dropped");

        if (!wanted) {
            continue;
        }
        dev->rte_xstats_ids[dev->rte_xstats_ids_size] = id;
        dev->rte_xstats_ids_size++;
    }

out:
    free(values);
    free(names);
}
/* Returns true if the DPDK port backing 'dev' is flagged as a port
 * representor (RTE_ETH_DEV_REPRESENTOR set in its device flags).
 *
 * Returns false if the device information cannot be obtained, e.g. because
 * 'dev->port_id' does not refer to a valid port: in that case 'dev_info' must
 * not be trusted, and dereferencing its 'dev_flags' pointer would be
 * undefined behavior. */
static bool
dpdk_port_is_representor(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_eth_dev_info dev_info;

    /* Start from a known state so that 'dev_flags' is NULL rather than stack
     * garbage if the query below fails before filling the structure. */
    memset(&dev_info, 0, sizeof dev_info);

    if (rte_eth_dev_info_get(dev->port_id, &dev_info) != 0
        || !dev_info.dev_flags) {
        return false;
    }

    return (*dev_info.dev_flags & RTE_ETH_DEV_REPRESENTOR) != 0;
}
/* Fills 'args' with the current configuration of the DPDK port 'netdev',
 * reading the device state under 'dev->mutex'.
 *
 * Keys written: "dpdk-devargs" (only if non-empty), "n_rxq", "rx-flow-ctrl",
 * "tx-flow-ctrl" and "flow-ctrl-autoneg" (each only when enabled),
 * "n_rxq_desc", "n_txq_desc", "rx-steering" (only for LACP steering),
 * "dpdk-lsc-interrupt", and "dpdk-vf-mac" (only for representor ports).
 *
 * Always returns 0.
 *
 * NOTE(review): "rx-flow-ctrl" is reported for RTE_ETH_FC_TX_PAUSE and
 * "tx-flow-ctrl" for RTE_ETH_FC_RX_PAUSE.  This presumably mirrors the
 * mapping used when the options are parsed -- confirm against the
 * set_config path before changing it. */
static int
netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *lsc_mode;
    bool fc_full;

    ovs_mutex_lock(&dev->mutex);

    if (dev->devargs && dev->devargs[0]) {
        smap_add_format(args, "dpdk-devargs", "%s", dev->devargs);
    }

    smap_add_format(args, "n_rxq", "%d", dev->user_n_rxq);

    /* Full flow control implies both directions. */
    fc_full = dev->fc_conf.mode == RTE_ETH_FC_FULL;
    if (dev->fc_conf.mode == RTE_ETH_FC_TX_PAUSE || fc_full) {
        smap_add(args, "rx-flow-ctrl", "true");
    }
    if (dev->fc_conf.mode == RTE_ETH_FC_RX_PAUSE || fc_full) {
        smap_add(args, "tx-flow-ctrl", "true");
    }
    if (dev->fc_conf.autoneg) {
        smap_add(args, "flow-ctrl-autoneg", "true");
    }

    smap_add_format(args, "n_rxq_desc", "%d", dev->rxq_size);
    smap_add_format(args, "n_txq_desc", "%d", dev->txq_size);

    if (dev->rx_steer_flags == DPDK_RX_STEER_LACP) {
        smap_add(args, "rx-steering", "rss+lacp");
    }

    lsc_mode = dev->lsc_interrupt_mode ? "true" : "false";
    smap_add(args, "dpdk-lsc-interrupt", lsc_mode);

    if (dpdk_port_is_representor(dev)) {
        smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT,
                        ETH_ADDR_ARGS(dev->requested_hwaddr));
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Searches 'dpdk_list' for the device whose DPDK port id equals 'port_id'.
 * Returns the first matching device, or NULL if there is none. */
static struct netdev_dpdk *
netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *match = NULL;
    struct netdev_dpdk *candidate;

    LIST_FOR_EACH (candidate, list_node, &dpdk_list) {
        if (candidate->port_id == port_id) {
            match = candidate;
            break;
        }
    }

    return match;
}
/* Returns the id of the first DPDK ethdev port whose MAC address equals the
 * address written in 'mac_str'.
 *
 * Returns DPDK_ETH_PORT_ID_INVALID if 'mac_str' cannot be parsed as an
 * Ethernet address (an error is logged in that case) or if no port has a
 * matching address. */
static dpdk_port_t
netdev_dpdk_get_port_by_mac(const char *mac_str)
{
    dpdk_port_t port_id;
    struct eth_addr mac, port_mac;

    if (!eth_addr_from_string(mac_str, &mac)) {
        VLOG_ERR("invalid mac: %s", mac_str);
        return DPDK_ETH_PORT_ID_INVALID;
    }

    RTE_ETH_FOREACH_DEV (port_id) {
        struct rte_ether_addr ea;

        /* Skip ports whose address cannot be read: 'ea' would otherwise be
         * compared while still uninitialized. */
        if (rte_eth_macaddr_get(port_id, &ea) != 0) {
            continue;
        }

        memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
        if (eth_addr_equals(mac, port_mac)) {
            return port_id;
        }
    }

    return DPDK_ETH_PORT_ID_INVALID;
}
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
/* Returns the id of the first DPDK port that matches the 'devargs' pattern.
 *
 * The iteration stops at the first match.  Because the loop is left early,
 * the iterator has to be released explicitly with
 * rte_eth_iterator_cleanup().
 *
 * If nothing matches, the value left in the loop variable by
 * RTE_ETH_FOREACH_MATCHING_DEV is returned (presumably the "no more ports"
 * sentinel of the iterator -- confirm against the DPDK macro). */
static dpdk_port_t
netdev_dpdk_get_port_by_devargs(const char *devargs)
    OVS_REQUIRES(dpdk_mutex)
{
    struct rte_dev_iterator match_iter;
    dpdk_port_t first_match;

    RTE_ETH_FOREACH_MATCHING_DEV (first_match, devargs, &match_iter) {
        /* Leaving the loop early: the iterator must be cleaned up. */
        rte_eth_iterator_cleanup(&match_iter);
        break;
    }

    return first_match;
}
/*
* Normally, a PCI id (optionally followed by a representor identifier)
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
* is enough for identifying a specific DPDK port.
* However, some NICs expose multiple ports that share the same PCI id,
* in which case the PCI id alone cannot identify a port.
*
* To handle that case, one more method is supported: "class=eth,mac=$MAC".
*
* Note that the compatibility is fully kept: user can still use the
* PCI id for adding ports (when it's enough for them).
*/
static dpdk_port_t
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation untill we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
const char *devargs, char **errp)
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
OVS_REQUIRES(dpdk_mutex)
{
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
dpdk_port_t new_port_id;
if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
} else {
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
if (!rte_eth_dev_is_valid_port(new_port_id)) {
/* Device not found in DPDK, attempt to attach it */
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
if (rte_dev_probe(devargs)) {
new_port_id = DPDK_ETH_PORT_ID_INVALID;
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
} else {
new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
if (rte_eth_dev_is_valid_port(new_port_id)) {
/* Attach successful */
dev->attached = true;
VLOG_INFO("Device '%s' attached to DPDK", devargs);
} else {
/* Attach unsuccessful */
new_port_id = DPDK_ETH_PORT_ID_INVALID;
}
}
}
}
if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
}
return new_port_id;
}
netdev-dpdk: Trigger port reconfiguration in main thread for resets. When OVS (main thread) configures a DPDK netdev, it holds a netdev_dpdk mutex lock. As part of this configure operation, the net/iavf driver (used with i40e VF devices) triggers a queue count change. The PF entity (serviced by a kernel PF driver for example) handles this change and requests back that the VF driver resets the VF device. The driver then completes the VF reset operation on its side and waits for completion of the iavf-event thread responsible for handling various VF device events. On the other hand, handling of the VF reset request in this iavf-event thread results in notifying the application with a port reset request (RTE_ETH_EVENT_INTR_RESET). The OVS reset callback tries to take a hold of the same netdev_dpdk mutex and blocks the iavf-event thread. As a result, the net/iavf driver (still running on OVS main thread) is unable to complete as it is waiting for iavf-event to complete. To break from this situation, the OVS reset callback now won't take a netdev_dpdk mutex. Instead, the port reset request is stored in a simple RTE_ETH_MAXPORTS array associated to a seq object. This is enough to let the VF driver complete this port initialization. The OVS main thread later handles the port reset request. More details in the DPDK upstream bz as this issue appeared following a change in DPDK. Link: https://bugs.dpdk.org/show_bug.cgi?id=1337 Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-01-18 17:15:00 +01:00
/* Sequence object changed by dpdk_eth_event_callback() each time a port
 * reset event (RTE_ETH_EVENT_INTR_RESET) is reported. */
static struct seq *netdev_dpdk_reset_seq;
/* Value of 'netdev_dpdk_reset_seq' last processed by netdev_dpdk_run(). */
static uint64_t netdev_dpdk_last_reset_seq;
/* Per-port flag, indexed by DPDK port id: set to true by the event callback
 * and cleared by netdev_dpdk_run() when it picks up the request. */
static atomic_bool netdev_dpdk_pending_reset[RTE_MAX_ETHPORTS];
/* Arranges for the poll loop to wake up when there is reset work for
 * netdev_dpdk_run(): immediately if the reset sequence number has already
 * moved past the last value processed, otherwise on its next change. */
static void
netdev_dpdk_wait(const struct netdev_class *netdev_class OVS_UNUSED)
{
    uint64_t current_seq = seq_read(netdev_dpdk_reset_seq);

    if (current_seq != netdev_dpdk_last_reset_seq) {
        /* A reset was signalled since the last run; do not sleep. */
        poll_immediate_wake();
        return;
    }

    seq_wait(netdev_dpdk_reset_seq, netdev_dpdk_last_reset_seq);
}
/* Processes port reset requests recorded by dpdk_eth_event_callback().
 *
 * Does nothing unless the reset sequence number changed since the previous
 * call.  Otherwise every port id whose pending flag is set gets the flag
 * cleared and, if a netdev is found for that port id, the netdev is marked
 * with 'reset_needed' and a reconfiguration is requested. */
static void
netdev_dpdk_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    uint64_t current_seq = seq_read(netdev_dpdk_reset_seq);
    dpdk_port_t pid;

    if (current_seq == netdev_dpdk_last_reset_seq) {
        return;
    }
    netdev_dpdk_last_reset_seq = current_seq;

    for (pid = 0; pid < RTE_MAX_ETHPORTS; pid++) {
        struct netdev_dpdk *dev;
        bool reset_pending;

        atomic_read_relaxed(&netdev_dpdk_pending_reset[pid],
                            &reset_pending);
        if (reset_pending) {
            /* Clear the flag before handling the request. */
            atomic_store_relaxed(&netdev_dpdk_pending_reset[pid], false);

            ovs_mutex_lock(&dpdk_mutex);
            dev = netdev_dpdk_lookup_by_port_id(pid);
            if (dev) {
                ovs_mutex_lock(&dev->mutex);
                dev->reset_needed = true;
                netdev_request_reconfigure(&dev->up);
                VLOG_DBG_RL(&rl, "%s: Device reset requested.",
                            netdev_get_name(&dev->up));
                ovs_mutex_unlock(&dev->mutex);
            }
            ovs_mutex_unlock(&dpdk_mutex);
        }
    }
}
/* Ethernet device event callback.
 *
 * For RTE_ETH_EVENT_INTR_RESET, records the request in the per-port pending
 * flag and changes 'netdev_dpdk_reset_seq'; no mutex is taken here and the
 * request is picked up later by netdev_dpdk_run().  All other event types
 * are ignored.  Always returns 0. */
static int
dpdk_eth_event_callback(dpdk_port_t port_id, enum rte_eth_event_type type,
                        void *param OVS_UNUSED, void *ret_param OVS_UNUSED)
{
    if ((int) type == RTE_ETH_EVENT_INTR_RESET) {
        atomic_store_relaxed(&netdev_dpdk_pending_reset[port_id], true);
        seq_change(netdev_dpdk_reset_seq);
    }

    return 0;
}
static void
dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
OVS_REQUIRES(dev->mutex)
{
int new_n_rxq;
new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
if (new_n_rxq != dev->user_n_rxq) {
dev->user_n_rxq = new_n_rxq;
netdev_request_reconfigure(&dev->up);
}
}
/* Computes the requested number of rx (if 'is_rx') or tx descriptors for
 * 'netdev' from the "n_rxq_desc" / "n_txq_desc" option in 'args'.
 *
 * A value that is not positive, exceeds NIC_PORT_MAX_Q_SIZE or is not a
 * power of 2 is replaced by the per-direction default.  When 'info' is
 * nonnull, the result is then rounded up to the device alignment (if any)
 * and clamped to the device's [nb_min, nb_max] descriptor limits.
 *
 * The result is stored as the requested size in the device; a
 * reconfiguration is requested when it differs from the current size, and a
 * message is logged when it differs from what the user asked for. */
static void
dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
                        struct rte_eth_dev_info *info, bool is_rx)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *option = is_rx ? "n_rxq_desc" : "n_txq_desc";
    int fallback = is_rx ? NIC_PORT_DEFAULT_RXQ_SIZE
                         : NIC_PORT_DEFAULT_TXQ_SIZE;
    int user_size = smap_get_int(args, option, fallback);
    int active_size = is_rx ? dev->rxq_size : dev->txq_size;
    int *requested = is_rx ? &dev->requested_rxq_size
                           : &dev->requested_txq_size;
    struct rte_eth_desc_lim *limits = NULL;
    bool will_reconfigure = false;
    int size = user_size;

    if (info) {
        limits = is_rx ? &info->rx_desc_lim : &info->tx_desc_lim;
    }

    /* Check for OVS limits. */
    if (size <= 0 || size > NIC_PORT_MAX_Q_SIZE || !is_pow2(size)) {
        size = fallback;
    }

    /* Check for device limits. */
    if (limits) {
        if (limits->nb_align) {
            size = ROUND_UP(size, limits->nb_align);
        }
        size = MIN(size, limits->nb_max);
        size = MAX(size, limits->nb_min);
    }

    *requested = size;

    if (active_size != size) {
        netdev_request_reconfigure(netdev);
        will_reconfigure = true;
    }

    if (user_size != size) {
        /* Log at INFO only when the adjustment leads to a reconfigure. */
        VLOG(will_reconfigure ? VLL_INFO : VLL_DBG,
             "%s: Unable to set the number of %s descriptors to %d. "
             "Adjusted to %d.", netdev_get_name(netdev),
             is_rx ? "rx": "tx", user_size, size);
    }
}
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
/* Parses the "rx-steering" option from 'args' ("rss" when absent).
 *
 * "rss+lacp" selects DPDK_RX_STEER_LACP; "rss" selects no flags; any other
 * value is reported through 'errp' and treated as no flags.  Selected flags
 * are dropped, with a warning, when 'dev' is not a DPDK_DEV_ETH device or
 * when the netdev flow API is enabled.  A reconfiguration is requested if
 * the resulting flags differ from 'dev->requested_rx_steer_flags'. */
static void
dpdk_set_rx_steer_config(struct netdev *netdev, struct netdev_dpdk *dev,
                         const struct smap *args, char **errp)
{
    const char *mode = smap_get_def(args, "rx-steering", "rss");
    uint64_t flags = 0;

    if (strcmp(mode, "rss+lacp") == 0) {
        flags = DPDK_RX_STEER_LACP;
    } else if (strcmp(mode, "rss") != 0) {
        VLOG_WARN_BUF(errp, "%s: options:rx-steering "
                      "unsupported parameter value '%s'",
                      netdev_get_name(netdev), mode);
    }

    if (flags) {
        /* At most one of the two restrictions is reported. */
        if (dev->type != DPDK_DEV_ETH) {
            VLOG_WARN_BUF(errp, "%s: options:rx-steering "
                          "is only supported on ethernet ports",
                          netdev_get_name(netdev));
            flags = 0;
        } else if (netdev_is_flow_api_enabled()) {
            VLOG_WARN_BUF(errp, "%s: options:rx-steering "
                          "is incompatible with hw-offload",
                          netdev_get_name(netdev));
            flags = 0;
        }
    }

    if (dev->requested_rx_steer_flags != flags) {
        dev->requested_rx_steer_flags = flags;
        netdev_request_reconfigure(netdev);
    }
}
static int
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
char **errp)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
bool flow_control_requested = true;
enum rte_eth_fc_mode fc_mode;
static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
{RTE_ETH_FC_NONE, RTE_ETH_FC_TX_PAUSE},
{RTE_ETH_FC_RX_PAUSE, RTE_ETH_FC_FULL }
};
struct rte_eth_dev_info info;
const char *new_devargs;
const char *vf_mac;
int err = 0;
int ret;
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
dpdk_set_rx_steer_config(netdev, dev, args, errp);
dpdk_set_rxq_config(dev, args);
new_devargs = smap_get(args, "dpdk-devargs");
if (dev->devargs && new_devargs && strcmp(new_devargs, dev->devargs)) {
/* The user requested a new device. If we return error, the caller
* will delete this netdev and try to recreate it. */
err = EAGAIN;
goto out;
}
/* dpdk-devargs is required for device configuration */
if (new_devargs && new_devargs[0]) {
/* Don't process dpdk-devargs if value is unchanged and port id
* is valid */
if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
&& rte_eth_dev_is_valid_port(dev->port_id))) {
dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
new_devargs,
errp);
if (!rte_eth_dev_is_valid_port(new_port_id)) {
err = EINVAL;
} else if (new_port_id == dev->port_id) {
/* Already configured, do not reconfigure again */
err = 0;
} else {
struct netdev_dpdk *dup_dev;
dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
if (dup_dev) {
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
netdev-dpdk: support port representors Dpdk port representors were introduced in dpdk versions 18.xx. Prior to port representors there was a one-to-one relationship between an rte device (e.g. PCI bus) and an eth device (referenced as dpdk port id in OVS). With port representors the relationship becomes one-to-many rte device to eth devices. For example in [3] there are two devices (representors) using the same PCI physical address 0000:08:00.0: "0000:08:00.0,representor=[3]" and "0000:08:00.0,representor=[5]". This commit handles the new one-to-many relationship. For example, when one of the device port representors in [3] is closed - the PCI bus cannot be detached until the other device port representor is closed as well. OVS remains backward compatible by supporting dpdk legacy PCI ports which do not include port representors. Dpdk port representors related commits are listed in [1]. Dpdk port representors documentation appears in [2]. A sample configuration which uses two representors ports (the output of "ovs-vsctl show" command) is shown in [3]. [1] e0cb96204b71 ("net/i40e: add support for representor ports") cf80ba6e2038 ("net/ixgbe: add support for representor ports") 26c08b979d26 ("net/mlx5: add port representor awareness") [2] https://doc.dpdk.org/guides-18.11/prog_guide/switch_representation.html [3] Bridge "ovs_br0" Port "ovs_br0" Interface "ovs_br0" type: internal Port "port-rep3" Interface "port-rep3" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[3]"} Port "port-rep5" Interface "port-rep5" type: dpdk options: {dpdk-devargs="0000:08:00.0,representor=[5]"} ovs_version: "2.10.90" Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-17 18:42:35 +00:00
"which is already in use by '%s'",
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
netdev_get_name(netdev), new_devargs,
netdev_get_name(&dup_dev->up));
err = EADDRINUSE;
} else {
int sid = rte_eth_dev_socket_id(new_port_id);
dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
dev->devargs = xstrdup(new_devargs);
dev->port_id = new_port_id;
netdev_request_reconfigure(&dev->up);
err = 0;
}
}
}
} else {
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
"The old 'dpdk<port_id>' names are not supported",
netdev_get_name(netdev));
err = EINVAL;
}
if (err) {
goto out;
}
ret = rte_eth_dev_info_get(dev->port_id, &info);
dpdk_process_queue_size(netdev, args, !ret ? &info : NULL, true);
dpdk_process_queue_size(netdev, args, !ret ? &info : NULL, false);
vf_mac = smap_get(args, "dpdk-vf-mac");
if (vf_mac) {
struct eth_addr mac;
if (!dpdk_port_is_representor(dev)) {
VLOG_WARN("'%s' is trying to set the VF MAC '%s' "
"but 'options:dpdk-vf-mac' is only supported for "
"VF representors.",
netdev_get_name(netdev), vf_mac);
} else if (!eth_addr_from_string(vf_mac, &mac)) {
VLOG_WARN("interface '%s': cannot parse VF MAC '%s'.",
netdev_get_name(netdev), vf_mac);
} else if (eth_addr_is_multicast(mac)) {
VLOG_WARN("interface '%s': cannot set VF MAC to multicast "
"address '%s'.", netdev_get_name(netdev), vf_mac);
} else if (!eth_addr_equals(dev->requested_hwaddr, mac)) {
dev->requested_hwaddr = mac;
netdev_request_reconfigure(netdev);
}
}
lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
netdev_request_reconfigure(netdev);
}
rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
if (!smap_get(args, "rx-flow-ctrl") && !smap_get(args, "tx-flow-ctrl")
&& !smap_get(args, "flow-ctrl-autoneg")) {
/* FIXME: User didn't ask for flow control configuration.
* For now we'll not print a warning if flow control is not
* supported by the DPDK port. */
flow_control_requested = false;
}
/* Get the Flow control configuration. */
err = -rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
if (err) {
if (err == ENOTSUP) {
if (flow_control_requested) {
VLOG_WARN("%s: Flow control is not supported.",
netdev_get_name(netdev));
}
err = 0; /* Not fatal. */
} else {
VLOG_WARN_BUF(errp, "%s: Cannot get flow control parameters: %s",
netdev_get_name(netdev), rte_strerror(err));
}
goto out;
}
if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
dev->fc_conf.mode = fc_mode;
dev->fc_conf.autoneg = autoneg;
dpdk_eth_flow_ctrl_setup(dev);
}
out:
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
static int
netdev_dpdk_vhost_client_get_config(const struct netdev *netdev,
struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int tx_retries_max;
ovs_mutex_lock(&dev->mutex);
if (dev->vhost_id) {
smap_add(args, "vhost-server-path", dev->vhost_id);
}
atomic_read_relaxed(&dev->vhost_tx_retries_max, &tx_retries_max);
if (tx_retries_max != VHOST_ENQ_RETRY_DEF) {
smap_add_format(args, "tx-retries-max", "%d", tx_retries_max);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
const struct smap *args,
char **errp OVS_UNUSED)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const char *path;
int max_tx_retries, cur_max_tx_retries;
ovs_mutex_lock(&dev->mutex);
if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
path = smap_get(args, "vhost-server-path");
if (!nullable_string_is_equal(path, dev->vhost_id)) {
free(dev->vhost_id);
dev->vhost_id = nullable_xstrdup(path);
netdev_request_reconfigure(netdev);
}
}
max_tx_retries = smap_get_int(args, "tx-retries-max",
VHOST_ENQ_RETRY_DEF);
if (max_tx_retries < VHOST_ENQ_RETRY_MIN
|| max_tx_retries > VHOST_ENQ_RETRY_MAX) {
max_tx_retries = VHOST_ENQ_RETRY_DEF;
}
atomic_read_relaxed(&dev->vhost_tx_retries_max, &cur_max_tx_retries);
if (max_tx_retries != cur_max_tx_retries) {
atomic_store_relaxed(&dev->vhost_tx_retries_max, max_tx_retries);
VLOG_INFO("Max Tx retries for vhost device '%s' set to %d",
netdev_get_name(netdev), max_tx_retries);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_get_numa_id(const struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
return dev->socket_id;
}
/* Records 'n_txq' as the requested number of tx queues for 'netdev'.
 *
 * A reconfiguration is requested only when 'n_txq' differs from the value
 * that was already requested.  Always returns 0. */
static int
netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    if (dev->requested_n_txq != n_txq) {
        dev->requested_n_txq = n_txq;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Allocates a 'struct netdev_rxq_dpdk' with dpdk_rte_mzalloc() and returns
 * a pointer to its embedded 'struct netdev_rxq', or NULL when the
 * allocation returned NULL. */
static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
    struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);

    return rx ? &rx->up : NULL;
}
/* Converts 'rxq', the 'up' member of a 'struct netdev_rxq_dpdk', back into
 * a pointer to the structure that contains it. */
static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx;

    rx = CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);

    return rx;
}
/* Initializes 'rxq' by copying the port id of its netdev into the DPDK rx
 * queue structure.  The port id is read while holding the device mutex.
 * Always returns 0. */
static int
netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);

    ovs_mutex_lock(&dev->mutex);
    rx->port_id = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Destructor counterpart of netdev_dpdk_rxq_construct().  The body is
 * empty: nothing is done with 'rxq', which is marked OVS_UNUSED. */
static void
netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
{
}
/* Releases, with rte_free(), the 'struct netdev_rxq_dpdk' that embeds
 * 'rxq'. */
static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
{
    rte_free(netdev_rxq_dpdk_cast(rxq));
}
/* Prepare the packet for HWOL.
 *
 * Adjusts the Tx offload flags (mbuf->ol_flags) and the header length
 * fields of 'mbuf' before it is handed over for transmission.  'dev'
 * provides the MTU and maximum packet length used for the TSO checks, and
 * the device name used in log messages.
 *
 * Return True if the packet is OK to continue.  Returns false when:
 *   - no offload is requested but Tx offload flags are still set once the
 *     marks have been cleared,
 *   - a tunnel type other than Geneve or VXLAN is set,
 *   - a TSO packet's headers plus segment size exceed dev->max_packet_len.
 *
 * This function does not free 'mbuf' on any path. */
static bool
netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
{
    struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
    /* "Requests" are the flags that ask for a checksum or segmentation
     * offload, for the inner and for the outer headers respectively. */
    const uint64_t all_inner_requests = (RTE_MBUF_F_TX_IP_CKSUM |
                                         RTE_MBUF_F_TX_L4_MASK |
                                         RTE_MBUF_F_TX_TCP_SEG);
    const uint64_t all_outer_requests = (RTE_MBUF_F_TX_OUTER_IP_CKSUM |
                                         RTE_MBUF_F_TX_OUTER_UDP_CKSUM);
    const uint64_t all_requests = all_inner_requests | all_outer_requests;
    /* "Marks" are the flags that only describe the packet (IP version,
     * tunnel type) without requesting any offload by themselves. */
    const uint64_t all_inner_marks = (RTE_MBUF_F_TX_IPV4 |
                                      RTE_MBUF_F_TX_IPV6);
    const uint64_t all_outer_marks = (RTE_MBUF_F_TX_OUTER_IPV4 |
                                      RTE_MBUF_F_TX_OUTER_IPV6 |
                                      RTE_MBUF_F_TX_TUNNEL_MASK);
    const uint64_t all_marks = all_inner_marks | all_outer_marks;
    if (!(mbuf->ol_flags & all_requests)) {
        /* No offloads requested, no marks should be set. */
        mbuf->ol_flags &= ~all_marks;
        /* Any Tx offload flag left at this point is neither a known request
         * nor a known mark: log it, dump the packet and reject it. */
        uint64_t unexpected = mbuf->ol_flags & RTE_MBUF_F_TX_OFFLOAD_MASK;
        if (OVS_UNLIKELY(unexpected)) {
            VLOG_WARN_RL(&rl, "%s: Unexpected Tx offload flags: %#"PRIx64,
                         netdev_get_name(&dev->up), unexpected);
            netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up),
                                  "Packet with unexpected ol_flags", mbuf);
            return false;
        }
        return true;
    }
    /* At least one offload is requested: the L4 header pointer is needed by
     * the length computations below. */
    ovs_assert(dp_packet_l4(pkt));
    /* If packet is vxlan or geneve tunnel packet, calculate outer
     * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
     * before. */
    const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK;
    /* Only Geneve and VXLAN are accepted as tunnel types here. */
    if (OVS_UNLIKELY(tunnel_type &&
                     tunnel_type != RTE_MBUF_F_TX_TUNNEL_GENEVE &&
                     tunnel_type != RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
        VLOG_WARN_RL(&rl, "%s: Unexpected tunnel type: %#"PRIx64,
                     netdev_get_name(&dev->up), tunnel_type);
        netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up),
                              "Packet with unexpected tunnel type", mbuf);
        return false;
    }
    if (tunnel_type && (mbuf->ol_flags & all_inner_requests)) {
        /* Tunneled packet with at least one inner offload requested. */
        if (mbuf->ol_flags & all_outer_requests) {
            /* Outer offloads are requested too: the outer lengths are taken
             * from the packet's Ethernet, L3 and L4 pointers. */
            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
                                 (char *) dp_packet_eth(pkt);
            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
                                 (char *) dp_packet_l3(pkt);
        } else {
            /* If no outer offloading is requested, clear outer marks. */
            mbuf->ol_flags &= ~all_outer_marks;
            mbuf->outer_l2_len = 0;
            mbuf->outer_l3_len = 0;
            /* Skip outer headers. */
            mbuf->l2_len += (char *) dp_packet_l4(pkt) -
                            (char *) dp_packet_eth(pkt);
        }
    } else {
        if (tunnel_type) {
            /* No inner offload is requested, fallback to non tunnel
             * checksum offloads. */
            mbuf->ol_flags &= ~all_inner_marks;
            /* Each outer request is translated into the corresponding
             * non-tunnel request and IP version mark. */
            if (mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_IP_CKSUM) {
                mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
                mbuf->ol_flags |= RTE_MBUF_F_TX_IPV4;
            }
            if (mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM) {
                mbuf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
                mbuf->ol_flags |= mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_IPV4
                                  ? RTE_MBUF_F_TX_IPV4 : RTE_MBUF_F_TX_IPV6;
            }
            /* The outer flags have been translated: drop them all. */
            mbuf->ol_flags &= ~(all_outer_requests | all_outer_marks);
        }
        /* Handled as a plain packet: no outer lengths, and l2/l3 lengths
         * are taken from the packet's Ethernet, L3 and L4 pointers. */
        mbuf->outer_l2_len = 0;
        mbuf->outer_l3_len = 0;
        mbuf->l2_len = (char *) dp_packet_l3(pkt) -
                       (char *) dp_packet_eth(pkt);
        mbuf->l3_len = (char *) dp_packet_l4(pkt) -
                       (char *) dp_packet_l3(pkt);
    }
    if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
        struct tcp_header *th = dp_packet_l4(pkt);
        int hdr_len;
        if (tunnel_type) {
            /* l4_len is not recomputed in the tunnel case; the value
             * already stored in the mbuf is used. */
            mbuf->tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len -
                              mbuf->l4_len - mbuf->outer_l3_len;
        } else {
            /* The TCP data offset is expressed in 32-bit words. */
            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
            mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
        }
        /* Reject the packet if headers plus one segment would not fit in
         * the maximum packet length of the device. */
        hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
        if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
            VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr: %"PRIu32", "
                         "gso: %"PRIu32", max len: %"PRIu32"",
                         dev->up.name, hdr_len, mbuf->tso_segsz,
                         dev->max_packet_len);
            return false;
        }
    }
    /* If L4 checksum is requested, IPv4 should be requested as well.
     * That is: an L4 request on a packet marked as IPv4 also turns on the
     * IP checksum request. */
    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK
        && mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
        mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
    }
    return true;
}
/* Runs netdev_dpdk_prep_hwol_packet() on each of the 'pkt_cnt' mbufs in
 * 'pkts'.
 *
 * Rejected mbufs are freed.  Accepted ones are moved towards the start of
 * 'pkts', in their original order, so that they occupy the first slots of
 * the array.
 *
 * Returns the number of accepted packets. */
static int
netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                            int pkt_cnt)
{
    int n_good = 0;

    for (int idx = 0; idx < pkt_cnt; idx++) {
        struct rte_mbuf *mbuf = pkts[idx];

        if (netdev_dpdk_prep_hwol_packet(dev, mbuf)) {
            /* Close the gap left by previously dropped packets. */
            if (OVS_UNLIKELY(idx != n_good)) {
                pkts[n_good] = mbuf;
            }
            n_good++;
        } else {
            rte_pktmbuf_free(mbuf);
        }
    }

    return n_good;
}
/* Logs 'message' at debug level, followed by the rte_pktmbuf_dump() output
 * for the whole packet length of 'mbuf'.
 *
 * When 'prefix' is nonnull, it is printed first and separated from
 * 'message' by ": ".  Nothing is done when VLOG_DROP_DBG() reports that the
 * message would be dropped.  The dump is rendered into a temporary memory
 * stream which is released before returning. */
static void
netdev_dpdk_mbuf_dump(const char *prefix, const char *message,
                      const struct rte_mbuf *mbuf)
{
    static struct vlog_rate_limit dump_rl = VLOG_RATE_LIMIT_INIT(5, 5);
    char *text = NULL;
    size_t text_len;
    FILE *memfile;

    if (VLOG_DROP_DBG(&dump_rl)) {
        return;
    }

    memfile = open_memstream(&text, &text_len);
    if (memfile == NULL) {
        VLOG_ERR("Unable to open memstream for mbuf dump: %s.",
                 ovs_strerror(errno));
        return;
    }

    rte_pktmbuf_dump(memfile, mbuf, rte_pktmbuf_pkt_len(mbuf));
    /* Closing the stream makes 'text' hold the final buffer. */
    fclose(memfile);

    if (prefix) {
        VLOG_DBG("%s: %s:\n%s", prefix, message, text);
    } else {
        VLOG_DBG("%s%s:\n%s", "", message, text);
    }

    free(text);
}
/* Sends the 'cnt' mbufs of 'pkts' on tx queue 'qid' of 'dev'.
 *
 * The batch first goes through rte_eth_tx_prepare().  Only the leading
 * packets it accepted are passed to rte_eth_tx_burst(), which is called
 * repeatedly until all of them are sent or a call sends nothing.  Every
 * mbuf that was not transmitted is freed here, so the caller gives up
 * ownership of 'pkts' whatever the outcome.
 *
 * Returns the number of packets that weren't transmitted. */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                         struct rte_mbuf **pkts, int cnt)
{
    uint16_t n_ready = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
    uint32_t n_sent = 0;

    if (n_ready != cnt) {
        /* pkts[n_ready] is the first packet that failed preparation. */
        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
                     "Only %u/%u are valid: %s", netdev_get_name(&dev->up),
                     n_ready, cnt, rte_strerror(rte_errno));
        netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up),
                              "First invalid packet", pkts[n_ready]);
    }

    while (n_sent != n_ready) {
        uint32_t burst = rte_eth_tx_burst(dev->port_id, qid, &pkts[n_sent],
                                          n_ready - n_sent);

        if (burst == 0) {
            /* No progress: give up on the remaining packets. */
            break;
        }
        n_sent += burst;
    }

    if (OVS_UNLIKELY(n_sent != cnt)) {
        /* Drop everything past the last transmitted packet, including the
         * packets that failed preparation. */
        rte_pktmbuf_free_bulk(&pkts[n_sent], cnt - n_sent);
    }

    return cnt - n_sent;
}
static inline bool
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rates based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 1000pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps are shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
struct rte_meter_srtcm_profile *profile,
struct rte_mbuf *pkt, uint64_t time)
{
uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
RTE_COLOR_GREEN;
}
/* Runs the 'pkt_cnt' packets in 'pkts' through the srTCM 'meter' configured
 * with 'profile'.
 *
 * Packets accepted by netdev_dpdk_srtcm_policer_pkt_handle() are compacted
 * towards the front of 'pkts', keeping their relative order.  Packets that
 * are not accepted are released with rte_pktmbuf_free() when 'should_steal'
 * is true; otherwise they are only left out of the compacted prefix.
 *
 * Returns the number of accepted packets, i.e. the length of the valid
 * prefix of 'pkts'. */
static int
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
srtcm_policer_run_single_packet(struct rte_meter_srtcm *meter,
struct rte_meter_srtcm_profile *profile,
struct rte_mbuf **pkts, int pkt_cnt,
bool should_steal)
{
int i = 0;
int cnt = 0;
struct rte_mbuf *pkt = NULL;
/* A single timestamp is sampled here and used for every packet of the
 * burst. */
uint64_t current_time = rte_rdtsc();
for (i = 0; i < pkt_cnt; i++) {
pkt = pkts[i];
/* Handle current packet */
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
if (netdev_dpdk_srtcm_policer_pkt_handle(meter, profile,
pkt, current_time)) {
/* Accepted: slide the packet down only if an earlier drop opened a gap. */
if (cnt != i) {
pkts[cnt] = pkt;
}
cnt++;
} else {
/* Not accepted: free it only when this function owns the packets. */
if (should_steal) {
rte_pktmbuf_free(pkt);
}
}
}
return cnt;
}
/* Applies the ingress 'policer' to the 'pkt_cnt' packets in 'pkts'.
 *
 * The metering itself is done by srtcm_policer_run_single_packet() using the
 * 'in_policer' meter and 'in_prof' profile of 'policer'; the whole run is
 * serialized with 'policer->policer_lock'.  'should_steal' is forwarded
 * unchanged.
 *
 * Returns the number of packets left at the front of 'pkts'. */
static int
ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
int pkt_cnt, bool should_steal)
{
int cnt = 0;
/* The lock is held for the whole burst. */
rte_spinlock_lock(&policer->policer_lock);
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
cnt = srtcm_policer_run_single_packet(&policer->in_policer,
&policer->in_prof,
pkts, pkt_cnt, should_steal);
rte_spinlock_unlock(&policer->policer_lock);
return cnt;
}
/* Returns true if 'dev' currently has a non-negative vhost device id, as
 * reported by netdev_dpdk_get_vid(), and its 'vhost_reconfigured' flag is
 * set.  Returns false otherwise. */
static bool
is_vhost_running(struct netdev_dpdk *dev)
{
    if (netdev_dpdk_get_vid(dev) < 0) {
        return false;
    }

    return dev->vhost_reconfigured;
}
/*
 * The receive path for the vhost port is the TX path out from guest.
 *
 * Dequeues up to NETDEV_MAX_BURST packets into 'batch'.
 *
 * Returns EAGAIN when the vhost device id is negative, the device has not
 * been reconfigured, the netdev is not up, or nothing was dequeued.
 * Otherwise returns 0 with 'batch->count' set to the number of packets that
 * remain after the ingress policer (if one is configured) has run.  Packets
 * dropped by the policer are added to the 'rx_dropped' and 'rx_qos_drops'
 * counters while holding 'dev->stats_lock'.
 *
 * If 'qfill' is nonnull, '*qfill' is set to the low 16 bits of
 * rte_vhost_rx_queue_count() when a full burst was dequeued, and to 0
 * otherwise.  That decision is made before the policer runs.
 */
static int
netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
struct dp_packet_batch *batch, int *qfill)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
uint16_t nb_rx = 0;
uint16_t qos_drops = 0;
/* Virtqueue index used for dequeueing: the VIRTIO_TXQ slot of the queue
 * pair selected by 'rxq->queue_id'. */
int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
netdev-dpdk: Fix calling vhost API with negative vid. Currently, rx and tx functions for vhost interfaces always obtain 'vid' twice. First time inside 'is_vhost_running' for checking the value and the second time in enqueue/dequeue function calls to send/receive packets. But second time we're not checking the returned value. If vhost device will be destroyed between checking and enqueue/dequeue, DPDK API will be called with '-1' instead of valid 'vid'. DPDK API does not validate the 'vid'. This leads to getting random memory value as a pointer to internal device structure inside DPDK. Access by this pointer leads to segmentation fault. For example: |00503|dpdk|INFO|VHOST_CONFIG: read message VHOST_USER_GET_VRING_BASE [New Thread 0x7fb6754910 (LWP 21246)] Program received signal SIGSEGV, Segmentation fault. rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 630 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) (gdb) bt full #0 rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 dev = 0xffffffff #1 __netdev_dpdk_vhost_send at lib/netdev-dpdk.c:1803 tx_pkts = <optimized out> cur_pkts = 0x7f340084f0 total_pkts = 32 dropped = 0 i = <optimized out> retries = 0 ... (gdb) p *((struct netdev_dpdk *) netdev) $8 = { ... , flags = (NETDEV_UP | NETDEV_PROMISC), ... , vid = {v = -1}, vhost_reconfigured = false, ... } Issue can be reproduced by stopping DPDK application (testpmd) inside guest while heavy traffic flows to this VM. Fix that by obtaining and checking the 'vid' only once. CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 0a0f39df1d5a ("netdev-dpdk: Add support for DPDK 16.07") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Billy O'Mahony <billy.o.mahony@intel.com> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-10-06 13:50:14 +03:00
int vid = netdev_dpdk_get_vid(dev);
netdev-dpdk: Fix calling vhost API with negative vid. Currently, rx and tx functions for vhost interfaces always obtain 'vid' twice. First time inside 'is_vhost_running' for checking the value and the second time in enqueue/dequeue function calls to send/receive packets. But second time we're not checking the returned value. If vhost device will be destroyed between checking and enqueue/dequeue, DPDK API will be called with '-1' instead of valid 'vid'. DPDK API does not validate the 'vid'. This leads to getting random memory value as a pointer to internal device structure inside DPDK. Access by this pointer leads to segmentation fault. For example: |00503|dpdk|INFO|VHOST_CONFIG: read message VHOST_USER_GET_VRING_BASE [New Thread 0x7fb6754910 (LWP 21246)] Program received signal SIGSEGV, Segmentation fault. rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 630 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) (gdb) bt full #0 rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 dev = 0xffffffff #1 __netdev_dpdk_vhost_send at lib/netdev-dpdk.c:1803 tx_pkts = <optimized out> cur_pkts = 0x7f340084f0 total_pkts = 32 dropped = 0 i = <optimized out> retries = 0 ... (gdb) p *((struct netdev_dpdk *) netdev) $8 = { ... , flags = (NETDEV_UP | NETDEV_PROMISC), ... , vid = {v = -1}, vhost_reconfigured = false, ... } Issue can be reproduced by stopping DPDK application (testpmd) inside guest while heavy traffic flows to this VM. Fix that by obtaining and checking the 'vid' only once. CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 0a0f39df1d5a ("netdev-dpdk: Add support for DPDK 16.07") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Billy O'Mahony <billy.o.mahony@intel.com> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-10-06 13:50:14 +03:00
/* The id sampled above is the one validated here and the one passed to the
 * vhost calls below; it is not read again. */
if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
|| !(dev->flags & NETDEV_UP))) {
return EAGAIN;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
(struct rte_mbuf **) batch->packets,
NETDEV_MAX_BURST);
if (!nb_rx) {
return EAGAIN;
}
if (qfill) {
if (nb_rx == NETDEV_MAX_BURST) {
/* The DPDK API returns a uint32_t which often has invalid bits in
* the upper 16-bits. Need to restrict the value to uint16_t. */
*qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
} else {
*qfill = 0;
}
}
/* The policer is called with should_steal set to true, so the packets it
 * rejects are freed there; only the count difference is kept for the
 * statistics update. */
if (policer) {
qos_drops = nb_rx;
nb_rx = ingress_policer_run(policer,
(struct rte_mbuf **) batch->packets,
nb_rx, true);
qos_drops -= nb_rx;
}
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
if (OVS_UNLIKELY(qos_drops)) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.rx_dropped += qos_drops;
dev->sw_stats->rx_qos_drops += qos_drops;
rte_spinlock_unlock(&dev->stats_lock);
}
batch->count = nb_rx;
dp_packet_batch_init_packet_fields(batch);
return 0;
}
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
/* Returns the entry of the device's 'vhost_rxq_enabled' array that
 * corresponds to the queue id of 'rxq'. */
static bool
netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
{
    return netdev_dpdk_cast(rxq->netdev)->vhost_rxq_enabled[rxq->queue_id];
}
/* Receives up to NETDEV_MAX_BURST packets from the ethdev queue behind 'rxq'
 * into 'batch'.
 *
 * When an ingress policer is configured on the device, the received packets
 * are run through it.  Packets it rejects are freed by the policer and are
 * added to the 'rx_dropped' and 'rx_qos_drops' counters under
 * 'dev->stats_lock'.
 *
 * If 'qfill' is nonnull, '*qfill' is set to the result of
 * rte_eth_rx_queue_count() when the burst pulled from the queue was full,
 * and to 0 otherwise.  Whether the burst was full is decided on the number
 * of packets actually received, before policing, so that policer drops do
 * not hide a queue that still holds packets.
 *
 * Returns EAGAIN if the device is not up or no packet was received.
 * Otherwise returns 0 with 'batch->count' set to the number of packets that
 * survived policing (which may be zero). */
static int
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
                     int *qfill)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    bool full_burst;
    int nb_rx;
    int dropped = 0;

    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
                             (struct rte_mbuf **) batch->packets,
                             NETDEV_MAX_BURST);
    if (!nb_rx) {
        return EAGAIN;
    }

    /* Record this now: the policer below may shrink 'nb_rx', which says
     * nothing about how many packets are still waiting in the queue. */
    full_burst = nb_rx == NETDEV_MAX_BURST;

    if (policer) {
        dropped = nb_rx;
        nb_rx = ingress_policer_run(policer,
                                    (struct rte_mbuf **) batch->packets,
                                    nb_rx, true);
        dropped -= nb_rx;
    }

    /* Update stats to reflect dropped packets */
    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.rx_dropped += dropped;
        dev->sw_stats->rx_qos_drops += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }

    batch->count = nb_rx;
    dp_packet_batch_init_packet_fields(batch);

    if (qfill) {
        if (full_burst) {
            *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
        } else {
            *qfill = 0;
        }
    }

    return 0;
}
/* Runs the 'cnt' packets in 'pkts' through the QoS configuration attached
 * to 'dev', if there is one.
 *
 * The configuration is fetched with ovsrcu_get() and its 'qos_run' operation
 * is invoked while holding the configuration's own spinlock.  'should_steal'
 * is passed through unchanged.
 *
 * Returns the value reported by 'qos_run', or 'cnt' itself when no QoS
 * configuration is set. */
static inline int
netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                    int cnt, bool should_steal)
{
    struct qos_conf *conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
    int kept;

    if (!conf) {
        return cnt;
    }

    rte_spinlock_lock(&conf->lock);
    kept = conf->ops->qos_run(conf, pkts, cnt, should_steal);
    rte_spinlock_unlock(&conf->lock);

    return kept;
}
/* Drops from 'pkts' every packet that is longer than 'dev->max_packet_len',
 * unless the packet carries the RTE_MBUF_F_TX_TCP_SEG flag.  The TSO packets
 * are filtered out during the offloading preparation for performance
 * reasons.
 *
 * Dropped packets are freed and reported through a rate-limited warning.
 * The remaining packets are compacted to the front of 'pkts' in their
 * original order.
 *
 * Returns the number of packets that remain. */
static int
netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                              int pkt_cnt)
{
    int kept = 0;

    for (int idx = 0; idx < pkt_cnt; idx++) {
        struct rte_mbuf *mbuf = pkts[idx];
        bool too_long = mbuf->pkt_len > dev->max_packet_len;
        bool is_tso = (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) != 0;

        if (OVS_UNLIKELY(too_long && !is_tso)) {
            VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " "
                         "max_packet_len %d", dev->up.name, mbuf->pkt_len,
                         dev->max_packet_len);
            rte_pktmbuf_free(mbuf);
        } else {
            /* Only write the slot when an earlier drop left a hole. */
            if (OVS_UNLIKELY(idx != kept)) {
                pkts[kept] = mbuf;
            }
            kept++;
        }
    }

    return kept;
}
/* Free callback for external mbuf buffers attached by
 * dpdk_pktmbuf_attach_extbuf(), which registers the buffer itself as
 * 'opaque'.  Releases 'opaque' with rte_free(); 'addr' is not used. */
static void
netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
{
    rte_free(opaque);
}
/* Allocates an external buffer with room for 'data_len' bytes of packet data
 * and attaches it to 'pkt'.
 *
 * The buffer is sized for the mbuf headroom, the payload, the shared info
 * structure and one extra pointer-sized slot, rounded up to pointer
 * alignment.  The total must fit in 16 bits.  The buffer is released through
 * netdev_dpdk_extbuf_free().
 *
 * Returns 'pkt' on success.  Returns NULL, after logging an error, if the
 * size is too large, the allocation fails or the shared info cannot be
 * initialized; 'pkt' is not freed in that case. */
static struct rte_mbuf *
dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
{
    struct rte_mbuf_ext_shared_info *shinfo;
    uint32_t needed;
    uint16_t buf_len;
    void *extbuf;

    needed = RTE_PKTMBUF_HEADROOM + data_len;
    needed += sizeof *shinfo + sizeof(uintptr_t);
    needed = RTE_ALIGN_CEIL(needed, sizeof(uintptr_t));
    if (OVS_UNLIKELY(needed > UINT16_MAX)) {
        VLOG_ERR("Can't copy packet: too big %u", needed);
        return NULL;
    }
    buf_len = needed;

    extbuf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
    if (OVS_UNLIKELY(!extbuf)) {
        VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
        return NULL;
    }

    /* 'buf_len' is passed by address, so the helper may adjust it; the
     * value it leaves behind is the one handed to the attach call. */
    shinfo = rte_pktmbuf_ext_shinfo_init_helper(extbuf, &buf_len,
                                                netdev_dpdk_extbuf_free,
                                                extbuf);
    if (OVS_UNLIKELY(!shinfo)) {
        rte_free(extbuf);
        VLOG_ERR("Failed to initialize shared info for mbuf while "
                 "attempting to attach an external buffer.");
        return NULL;
    }

    rte_pktmbuf_attach_extbuf(pkt, extbuf, rte_malloc_virt2iova(extbuf),
                              buf_len, shinfo);
    rte_pktmbuf_reset_headroom(pkt);

    return pkt;
}
/* Allocates an mbuf from 'mp' that can hold 'data_len' bytes.
 *
 * If the tailroom of the freshly allocated mbuf is too small, an external
 * buffer of sufficient size is attached to it instead.
 *
 * Returns the mbuf, or NULL if the mbuf could not be allocated or the
 * external buffer could not be attached (the mbuf is freed in that case). */
static struct rte_mbuf *
dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
{
    struct rte_mbuf *mbuf = rte_pktmbuf_alloc(mp);

    if (OVS_UNLIKELY(mbuf == NULL)) {
        return NULL;
    }

    if (rte_pktmbuf_tailroom(mbuf) < data_len
        && !dpdk_pktmbuf_attach_extbuf(mbuf, data_len)) {
        rte_pktmbuf_free(mbuf);
        return NULL;
    }

    return mbuf;
}
/* Creates a copy of 'pkt_orig' backed by an mbuf allocated from 'mp'.
 *
 * The packet data is copied, along with the mbuf's 'tx_offload',
 * 'packet_type' and 'tso_segsz' fields and every dp_packet field from
 * 'l2_pad_size' to the end of the structure.  The offload flags of the
 * original are OR-ed into the copy except for RTE_MBUF_F_EXTERNAL and
 * RTE_MBUF_F_INDIRECT, which keep the values of the new mbuf.
 *
 * When the copy has an L3 offset, 'l2_len' and 'l3_len' are recomputed from
 * the copy's own header pointers (0 when the Ethernet or L4 pointer,
 * respectively, is missing).
 *
 * Returns the new packet, or NULL if no mbuf could be allocated.
 * 'pkt_orig' is not freed. */
static struct dp_packet *
dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
{
    const uint32_t size = dp_packet_size(pkt_orig);
    struct rte_mbuf *dst_mbuf = dpdk_pktmbuf_alloc(mp, size);
    struct dp_packet *copy;
    const char *eth;
    const char *l3;
    const char *l4;

    if (OVS_UNLIKELY(!dst_mbuf)) {
        return NULL;
    }

    copy = CONTAINER_OF(dst_mbuf, struct dp_packet, mbuf);
    memcpy(dp_packet_data(copy), dp_packet_data(pkt_orig), size);
    dp_packet_set_size(copy, size);

    dst_mbuf->tx_offload = pkt_orig->mbuf.tx_offload;
    dst_mbuf->packet_type = pkt_orig->mbuf.packet_type;
    dst_mbuf->ol_flags |= (pkt_orig->mbuf.ol_flags &
                           ~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT));
    dst_mbuf->tso_segsz = pkt_orig->mbuf.tso_segsz;

    /* Bulk-copy the tail of the dp_packet structure, starting at
     * 'l2_pad_size'. */
    memcpy(&copy->l2_pad_size, &pkt_orig->l2_pad_size,
           sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));

    l3 = dp_packet_l3(copy);
    if (!l3) {
        return copy;
    }

    eth = dp_packet_eth(copy);
    if (eth) {
        dst_mbuf->l2_len = l3 - eth;
    } else {
        dst_mbuf->l2_len = 0;
    }

    l4 = dp_packet_l4(copy);
    if (l4) {
        dst_mbuf->l3_len = l4 - l3;
    } else {
        dst_mbuf->l3_len = 0;
    }

    return copy;
}
/* Replace packets in a 'batch' with their corresponding copies using
 * DPDK memory.
 *
 * Packets whose source is already DPBUF_DPDK are put back into the batch as
 * they are.  Every other packet is copied into an mbuf taken from the
 * mempool of 'netdev' and the original is deleted; if the copy cannot be
 * made, the packet is simply not put back into the batch.
 *
 * Returns the number of good packets in the batch. */
static size_t
dpdk_copy_batch_to_mbuf(struct netdev *netdev, struct dp_packet_batch *batch)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
size_t i, size = dp_packet_batch_size(batch);
struct dp_packet *packet;
DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
if (OVS_UNLIKELY(packet->source == DPBUF_DPDK)) {
dp_packet_batch_refill(batch, packet, i);
} else {
struct dp_packet *pktcopy;
pktcopy = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
if (pktcopy) {
dp_packet_batch_refill(batch, pktcopy, i);
}
/* The original is released whether or not the copy succeeded. */
dp_packet_delete(packet);
}
}
netdev-dpdk: Fix race condition with DPDK mempools in non pmd threads DPDK mempools rely on rte_lcore_id() to implement a thread-local cache. Our non pmd threads had rte_lcore_id() == 0. This allowed concurrent access to the "thread-local" cache, causing crashes. This commit resolves the issue with the following changes: - Every non pmd thread has the same lcore_id (0, for management reasons), which is not shared with any pmd thread (lcore_id for pmd threads now start from 1) - DPDK mbufs must be allocated/freed in pmd threads. When there is the need to use mempools in non pmd threads, like in dpdk_do_tx_copy(), a mutex must be held. - The previous change does not allow us anymore to pass DPDK mbufs to handler threads: therefore this commit partially revert 143859ec63d45e. Now packets are copied for upcall processing. We can remove the extra memcpy by processing upcalls in the pmd thread itself. With the introduction of the extra locking, the packet throughput will be lower in the following cases: - When using internal (tap) devices with DPDK devices on the same datapath. Anyway, to support internal devices efficiently, we needed DPDK KNI devices, which will be proper pmd devices and will not need this locking. - When packets are processed in the slow path by non pmd threads. This overhead can be avoided by handling the upcalls directly in pmd threads (a change that has already been proposed by Ryan Wilson) Also, the following two fixes have been introduced: - In dpdk_free_buf() use rte_pktmbuf_free_seg() instead of rte_mempool_put(). This allows OVS to run properly with CONFIG_RTE_LIBRTE_MBUF_DEBUG DPDK option - Do not bulk free mbufs in a transmission queue. They may belong to different mempools Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2014-07-17 14:29:36 -07:00
return dp_packet_batch_size(batch);
}
/* Prepares the packets of 'batch' for transmission on 'netdev'.
 *
 * 'stats' is zeroed first and then filled with the number of packets dropped
 * at each of the following stages, applied in this order:
 *
 *   1. If any packet is not backed by DPDK memory, the batch is converted
 *      with dpdk_copy_batch_to_mbuf() ('tx_failure_drops').
 *   2. Oversized packets are removed ('tx_mtu_exceeded_drops').
 *   3. Each mbuf is prepared for hardware offloading
 *      ('tx_invalid_hwol_drops').
 *   4. The QoS policy is applied ('tx_qos_drops').
 *
 * Returns the number of packets left at the front of 'batch->packets'. */
static size_t
netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch,
                        struct netdev_dpdk_sw_stats *stats)
{
    struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    size_t remaining = dp_packet_batch_size(batch);
    struct dp_packet *packet;
    bool has_foreign = false;
    size_t survivors;

    memset(stats, 0, sizeof *stats);

    /* One packet from outside DPDK memory is enough to require the copy
     * pass over the whole batch. */
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (packet->source != DPBUF_DPDK) {
            has_foreign = true;
            break;
        }
    }

    /* Copy dp-packets to mbufs. */
    if (OVS_UNLIKELY(has_foreign)) {
        survivors = dpdk_copy_batch_to_mbuf(netdev, batch);
        stats->tx_failure_drops += remaining - survivors;
        remaining = survivors;
    }

    /* Drop oversized packets. */
    survivors = netdev_dpdk_filter_packet_len(dev, pkts, remaining);
    stats->tx_mtu_exceeded_drops += remaining - survivors;
    remaining = survivors;

    /* Prepare each mbuf for hardware offloading. */
    survivors = netdev_dpdk_prep_hwol_batch(dev, pkts, remaining);
    stats->tx_invalid_hwol_drops += remaining - survivors;
    remaining = survivors;

    /* Apply Quality of Service policy. */
    survivors = netdev_dpdk_qos_run(dev, pkts, remaining, true);
    stats->tx_qos_drops += remaining - survivors;

    return survivors;
}
static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch,
bool concurrent_txq OVS_UNUSED)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int max_retries = VHOST_ENQ_RETRY_MIN;
int cnt, batch_cnt, vhost_batch_cnt;
int vid = netdev_dpdk_get_vid(dev);
struct netdev_dpdk_sw_stats stats;
struct rte_mbuf **pkts;
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
int dropped;
int retries;
batch_cnt = cnt = dp_packet_batch_size(batch);
qid = dev->tx_q[qid % netdev->n_txq].map;
if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
|| !(dev->flags & NETDEV_UP))) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += cnt;
rte_spinlock_unlock(&dev->stats_lock);
dp_packet_delete_batch(batch, true);
return 0;
}
if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
COVERAGE_INC(vhost_tx_contention);
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
}
cnt = netdev_dpdk_common_send(netdev, batch, &stats);
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
dropped = batch_cnt - cnt;
pkts = (struct rte_mbuf **) batch->packets;
vhost_batch_cnt = cnt;
retries = 0;
do {
int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
int tx_pkts;
tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, pkts, cnt);
if (OVS_LIKELY(tx_pkts)) {
/* Packets have been sent.*/
cnt -= tx_pkts;
/* Prepare for possible retry.*/
pkts = &pkts[tx_pkts];
if (OVS_UNLIKELY(cnt && !retries)) {
/*
* Read max retries as there are packets not sent
* and no retries have already occurred.
*/
atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
}
} else {
/* No packets sent - do not retry.*/
break;
}
} while (cnt && (retries++ < max_retries));
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
stats.tx_failure_drops += cnt;
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
dropped += cnt;
stats.tx_retries = MIN(retries, max_retries);
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
if (OVS_UNLIKELY(dropped || stats.tx_retries)) {
struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dropped;
sw_stats->tx_retries += stats.tx_retries;
sw_stats->tx_failure_drops += stats.tx_failure_drops;
sw_stats->tx_mtu_exceeded_drops += stats.tx_mtu_exceeded_drops;
sw_stats->tx_qos_drops += stats.tx_qos_drops;
sw_stats->tx_invalid_hwol_drops += stats.tx_invalid_hwol_drops;
rte_spinlock_unlock(&dev->stats_lock);
}
pkts = (struct rte_mbuf **) batch->packets;
rte_pktmbuf_free_bulk(pkts, vhost_batch_cnt);
return 0;
}
/* Sends the packets in 'batch' out of 'netdev' on tx queue 'qid' through
 * netdev_dpdk_eth_tx_burst().
 *
 * If NETDEV_UP is not set in dev->flags, nothing is transmitted: the whole
 * batch is added to dev->stats.tx_dropped and the batch is deleted.
 *
 * When 'concurrent_txq' is true, 'qid' is first reduced modulo
 * dev->up.n_txq and the tx_lock of that queue is held from before
 * netdev_dpdk_common_send() until after the statistics update.
 *
 * Drop counters are folded into dev->stats and dev->sw_stats under
 * dev->stats_lock, and only when at least one packet was dropped.
 *
 * Every return statement in this function returns 0. */
static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch, bool concurrent_txq)
{
struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
/* Size of the batch as handed in, captured before any filtering. */
int batch_cnt = dp_packet_batch_size(batch);
/* NOTE(review): 'stats' is not initialized here; presumably
 * netdev_dpdk_common_send() fills it in -- confirm against its definition. */
struct netdev_dpdk_sw_stats stats;
int cnt, dropped;
/* Device is not up: account the whole batch as dropped and release it. */
if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dp_packet_batch_size(batch);
rte_spinlock_unlock(&dev->stats_lock);
dp_packet_delete_batch(batch, true);
return 0;
}
dpif-netdev: XPS (Transmit Packet Steering) implementation. If CPU number in pmd-cpu-mask is not divisible by the number of queues and in a few more complex situations there may be unfair distribution of TX queue-ids between PMD threads. For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask such distribution is possible: <------------------------------------------------------------------------> pmd thread numa_id 0 core_id 13: port: vhost-user1 queue-id: 1 port: dpdk0 queue-id: 3 pmd thread numa_id 0 core_id 14: port: vhost-user1 queue-id: 2 pmd thread numa_id 0 core_id 16: port: dpdk0 queue-id: 0 pmd thread numa_id 0 core_id 17: port: dpdk0 queue-id: 1 pmd thread numa_id 0 core_id 12: port: vhost-user1 queue-id: 0 port: dpdk0 queue-id: 2 pmd thread numa_id 0 core_id 15: port: vhost-user1 queue-id: 3 <------------------------------------------------------------------------> As we can see above dpdk0 port polled by threads on cores: 12, 13, 16 and 17. By design of dpif-netdev, there is only one TX queue-id assigned to each pmd thread. This queue-id's are sequential similar to core-id's. And thread will send packets to queue with exact this queue-id regardless of port. In previous example: pmd thread on core 12 will send packets to tx queue 0 pmd thread on core 13 will send packets to tx queue 1 ... pmd thread on core 17 will send packets to tx queue 5 So, for dpdk0 port after truncating in netdev-dpdk: core 12 --> TX queue-id 0 % 4 == 0 core 13 --> TX queue-id 1 % 4 == 1 core 16 --> TX queue-id 4 % 4 == 0 core 17 --> TX queue-id 5 % 4 == 1 As a result only 2 of 4 queues used. To fix this issue some kind of XPS implemented in following way: * TX queue-ids are allocated dynamically. * When PMD thread first time tries to send packets to new port it allocates less used TX queue for this port. * PMD threads periodically performes revalidation of allocated TX queue-ids. 
If queue wasn't used in last XPS_TIMEOUT_MS milliseconds it will be freed while revalidation. * XPS is not working if we have enough TX queues. Reported-by: Zhihong Wang <zhihong.wang@intel.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
2016-07-27 17:44:41 +03:00
/* Shared tx queue: wrap 'qid' into the configured range and serialize
 * senders on that queue's lock. */
if (OVS_UNLIKELY(concurrent_txq)) {
qid = qid % dev->up.n_txq;
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
}
/* NOTE(review): netdev_dpdk_common_send() is defined elsewhere; from its use
 * here it returns how many packets at the front of 'pkts' should still be
 * transmitted and records per-reason drops in 'stats' -- confirm. */
cnt = netdev_dpdk_common_send(netdev, batch, &stats);
/* The burst helper's return value is treated as the number of packets that
 * could not be sent; it is counted as a tx failure drop. */
dropped = netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);
stats.tx_failure_drops += dropped;
/* Add the packets removed before the burst (batch_cnt - cnt) to the total. */
dropped += batch_cnt - cnt;
if (OVS_UNLIKELY(dropped)) {
struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dropped;
sw_stats->tx_failure_drops += stats.tx_failure_drops;
sw_stats->tx_mtu_exceeded_drops += stats.tx_mtu_exceeded_drops;
sw_stats->tx_qos_drops += stats.tx_qos_drops;
sw_stats->tx_invalid_hwol_drops += stats.tx_invalid_hwol_drops;
rte_spinlock_unlock(&dev->stats_lock);
}
dpif-netdev: XPS (Transmit Packet Steering) implementation. If CPU number in pmd-cpu-mask is not divisible by the number of queues and in a few more complex situations there may be unfair distribution of TX queue-ids between PMD threads. For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask such distribution is possible: <------------------------------------------------------------------------> pmd thread numa_id 0 core_id 13: port: vhost-user1 queue-id: 1 port: dpdk0 queue-id: 3 pmd thread numa_id 0 core_id 14: port: vhost-user1 queue-id: 2 pmd thread numa_id 0 core_id 16: port: dpdk0 queue-id: 0 pmd thread numa_id 0 core_id 17: port: dpdk0 queue-id: 1 pmd thread numa_id 0 core_id 12: port: vhost-user1 queue-id: 0 port: dpdk0 queue-id: 2 pmd thread numa_id 0 core_id 15: port: vhost-user1 queue-id: 3 <------------------------------------------------------------------------> As we can see above dpdk0 port polled by threads on cores: 12, 13, 16 and 17. By design of dpif-netdev, there is only one TX queue-id assigned to each pmd thread. This queue-id's are sequential similar to core-id's. And thread will send packets to queue with exact this queue-id regardless of port. In previous example: pmd thread on core 12 will send packets to tx queue 0 pmd thread on core 13 will send packets to tx queue 1 ... pmd thread on core 17 will send packets to tx queue 5 So, for dpdk0 port after truncating in netdev-dpdk: core 12 --> TX queue-id 0 % 4 == 0 core 13 --> TX queue-id 1 % 4 == 1 core 16 --> TX queue-id 4 % 4 == 0 core 17 --> TX queue-id 5 % 4 == 1 As a result only 2 of 4 queues used. To fix this issue some kind of XPS implemented in following way: * TX queue-ids are allocated dynamically. * When PMD thread first time tries to send packets to new port it allocates less used TX queue for this port. * PMD threads periodically performes revalidation of allocated TX queue-ids. 
If queue wasn't used in last XPS_TIMEOUT_MS milliseconds it will be freed while revalidation. * XPS is not working if we have enough TX queues. Reported-by: Zhihong Wang <zhihong.wang@intel.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
2016-07-27 17:44:41 +03:00
/* Release the queue lock taken above, using the same wrapped 'qid'. */
if (OVS_UNLIKELY(concurrent_txq)) {
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
}
return 0;
}
/* Applies 'mac' to 'dev'.  The caller must hold dev->mutex.
 *
 * For DPDK_DEV_ETH devices the address is first handed to
 * rte_eth_dev_default_mac_addr_set(); for every other device type only the
 * cached copy in dev->hwaddr is updated.
 *
 * Returns 0 on success.  Otherwise returns the negated result of
 * rte_eth_dev_default_mac_addr_set(), logs a warning and leaves dev->hwaddr
 * unchanged. */
static int
netdev_dpdk_set_etheraddr__(struct netdev_dpdk *dev, const struct eth_addr mac)
    OVS_REQUIRES(dev->mutex)
{
    int error = 0;

    if (dev->type == DPDK_DEV_ETH) {
        struct rte_ether_addr rte_mac;

        memcpy(rte_mac.addr_bytes, mac.ea, ETH_ADDR_LEN);
        /* Negate the DPDK result so that 'error' carries the value passed to
         * rte_strerror() and returned to the caller. */
        error = -rte_eth_dev_default_mac_addr_set(dev->port_id, &rte_mac);
    }

    if (error) {
        VLOG_WARN("%s: Failed to set requested mac("ETH_ADDR_FMT"): %s",
                  netdev_get_name(&dev->up), ETH_ADDR_ARGS(mac),
                  rte_strerror(error));
        return error;
    }

    /* Only remember the new address once it has been accepted. */
    dev->hwaddr = mac;
    return 0;
}
static int
netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int err = 0;
ovs_mutex_lock(&dev->mutex);
if (!eth_addr_equals(dev->hwaddr, mac)) {
err = netdev_dpdk_set_etheraddr__(dev, mac);
if (!err) {
netdev_change_seq_changed(netdev);
}
}
ovs_mutex_unlock(&dev->mutex);
return err;
}
/* Copies the Ethernet address stored in the netdev_dpdk behind 'netdev'
 * (its 'hwaddr' member) into '*mac'.  The copy is made while holding the
 * device mutex.  Always returns 0. */
static int
netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
    struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dpdk_dev->mutex);
    *mac = dpdk_dev->hwaddr;
    ovs_mutex_unlock(&dpdk_dev->mutex);

    return 0;
}
/* Stores the 'mtu' member of the netdev_dpdk behind 'netdev' into '*mtup'.
 * The value is read while holding the device mutex.  Always returns 0. */
static int
netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
    struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dpdk_dev->mutex);
    *mtup = dpdk_dev->mtu;
    ovs_mutex_unlock(&dpdk_dev->mutex);

    return 0;
}
/* Requests that the MTU of 'netdev' be changed to 'mtu'.
 *
 * The request is rejected with EINVAL (and a warning is logged) when
 * MTU_TO_MAX_FRAME_LEN(mtu) exceeds NETDEV_DPDK_MAX_PKT_LEN or when 'mtu' is
 * below RTE_ETHER_MIN_MTU.
 *
 * On acceptance the MTU is not applied here: under dev->mutex the value is
 * stored in dev->requested_mtu and, only if it differs from the previously
 * requested value, netdev_request_reconfigure() is called.
 *
 * Returns 0 when the request is accepted, EINVAL otherwise. */
static int
netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
netdev-dpdk: Fix requested MTU size validation. This commit replaces MTU_TO_FRAME_LEN(mtu) with MTU_TO_MAX_FRAME_LEN(mtu) in netdev_dpdk_set_mtu(), in order to determine if the total length of the L2 frame with an MTU of ’mtu’ exceeds NETDEV_DPDK_MAX_PKT_LEN. When setting an MTU we first check if the requested total frame length (which includes associated L2 overhead) will exceed the maximum frame length supported in netdev_dpdk_set_mtu(). The frame length is calculated by MTU_TO_FRAME_LEN as MTU + ETHER_HEADER + ETHER_CRC. The MTU for the device will be set at a later stage in dpdk_eth_dev_init() using rte_eth_dev_set_mtu(mtu). However when using rte_eth_dev_set_mtu(mtu) the calculation used to check that the frame does not exceed the max frame length for that device varies between DPDK device drivers. For example ixgbe driver calculates the frame length for a given MTU as mtu + ETHER_HDR_LEN + ETHER_CRC_LEN i40e driver calculates it as mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + I40E_VLAN_TAG_SIZE * 2 em driver calculates it as mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + VLAN_TAG_SIZE Currently it is possible to set an MTU for a netdev_dpdk device that exceeds the upper limit MTU for that devices DPDK driver. This leads to a segfault. This is because the frame length comparison as is, does not take into account the addition of the vlan tag overhead expected in the drivers. The netdev_dpdk_set_mtu() call will incorrectly succeed but the subsequent dpdk_eth_dev_init() will fail before the queues have been created for the DPDK device. This coupled with assumptions regarding reconfiguration requirements for the netdev will lead to a segfault when the rxq is polled for this device. A simple way to avoid this is by using MTU_TO_MAX_FRAME_LEN(mtu) when validating a requested MTU in netdev_dpdk_set_mtu(). 
MTU_TO_MAX_FRAME_LEN(mtu) is equivalent to the following: mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + (2 * VLAN_HEADER_LEN) By using MTU_TO_MAX_FRAME_LEN at the netdev_dpdk_set_mtu() stage, OvS now takes into account the maximum L2 overhead that a DPDK driver could allow for in its frame size calculation. This allows OVS to flag an error rather than the DPDK driver if the frame length exceeds the max DPDK frame length. OVS can fail gracefully at this point and use the default MTU of 1500 to continue to configure the port. Note: this fix is a work around, a better approach would be if DPDK devices could report the maximum MTU value that can be requested on a per device basis. This capability however is not currently available. A downside of this patch is that the MTU upper limit will be reduced by 8 bytes for DPDK devices that do not need to account for vlan tags in the frame length driver calculations e.g. ixgbe devices upper MTU limit is reduced from the OVS point of view from 9710 to 9702. CC: Mark Kavanagh <mark.b.kavanagh@intel.com> Fixes: 0072e931 ("netdev-dpdk: add support for jumbo frames") Signed-off-by: Ian Stokes <ian.stokes@intel.com> Co-authored-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Acked-by: Flavio Leitner <fbl@sysclose.org>
2018-01-09 16:09:28 +00:00
/* XXX: Ensure that the overall frame length of the requested MTU does not
 * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
 * the L2 frame length is calculated for a given MTU when
 * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
 * headers, the em driver includes 1 x vlan header, the ixgbe driver does
 * not include vlan headers. As such we should use
 * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
 * (8 bytes) for comparison. This avoids a failure later with
 * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
 * a method to retrieve the upper bound MTU for a given device.
 */
/* NOTE(review): the trailing "\n" in the warning below looks redundant for a
 * VLOG message -- confirm before removing. */
if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
|| mtu < RTE_ETHER_MIN_MTU) {
VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
return EINVAL;
}
/* Record the request; a reconfigure is only asked for when the requested
 * value actually changes. */
ovs_mutex_lock(&dev->mutex);
if (dev->requested_mtu != mtu) {
dev->requested_mtu = mtu;
netdev_request_reconfigure(netdev);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
struct netdev_stats *stats)
{
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
struct rte_vhost_stat_name *vhost_stats_names = NULL;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
struct rte_vhost_stat *vhost_stats = NULL;
int vhost_stats_count;
int err;
int qid;
int vid;
ovs_mutex_lock(&dev->mutex);
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
if (!is_vhost_running(dev)) {
err = EPROTO;
goto out;
}
vid = netdev_dpdk_get_vid(dev);
/* We expect all rxqs have the same number of stats, only query rxq0. */
qid = 0 * VIRTIO_QNUM + VIRTIO_TXQ;
err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
if (err < 0) {
err = EPROTO;
goto out;
}
vhost_stats_count = err;
vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names);
vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats);
err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
#define VHOST_RXQ_STATS \
VHOST_RXQ_STAT(rx_packets, "good_packets") \
VHOST_RXQ_STAT(rx_bytes, "good_bytes") \
VHOST_RXQ_STAT(rx_broadcast_packets, "broadcast_packets") \
VHOST_RXQ_STAT(multicast, "multicast_packets") \
VHOST_RXQ_STAT(rx_undersized_errors, "undersize_packets") \
VHOST_RXQ_STAT(rx_1_to_64_packets, "size_64_packets") \
VHOST_RXQ_STAT(rx_65_to_127_packets, "size_65_127_packets") \
VHOST_RXQ_STAT(rx_128_to_255_packets, "size_128_255_packets") \
VHOST_RXQ_STAT(rx_256_to_511_packets, "size_256_511_packets") \
VHOST_RXQ_STAT(rx_512_to_1023_packets, "size_512_1023_packets") \
VHOST_RXQ_STAT(rx_1024_to_1522_packets, "size_1024_1518_packets") \
VHOST_RXQ_STAT(rx_1523_to_max_packets, "size_1519_max_packets")
#define VHOST_RXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0;
VHOST_RXQ_STATS;
#undef VHOST_RXQ_STAT
for (int q = 0; q < dev->up.n_rxq; q++) {
qid = q * VIRTIO_QNUM + VIRTIO_TXQ;
err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
for (int i = 0; i < vhost_stats_count; i++) {
#define VHOST_RXQ_STAT(MEMBER, NAME) \
if (string_ends_with(vhost_stats_names[i].name, NAME)) { \
dev->stats.MEMBER += vhost_stats[i].value; \
continue; \
}
VHOST_RXQ_STATS;
#undef VHOST_RXQ_STAT
}
}
/* OVS reports 64 bytes and smaller packets into "rx_1_to_64_packets".
* Since vhost only reports good packets and has no error counter,
* rx_undersized_errors is hijacked (see above) to retrieve
* "undersize_packets". */
dev->stats.rx_1_to_64_packets += dev->stats.rx_undersized_errors;
memset(&dev->stats.rx_undersized_errors, 0xff,
sizeof dev->stats.rx_undersized_errors);
#define VHOST_RXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER;
VHOST_RXQ_STATS;
#undef VHOST_RXQ_STAT
free(vhost_stats_names);
vhost_stats_names = NULL;
free(vhost_stats);
vhost_stats = NULL;
/* We expect all txqs have the same number of stats, only query txq0. */
qid = 0 * VIRTIO_QNUM;
err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
if (err < 0) {
err = EPROTO;
goto out;
}
vhost_stats_count = err;
vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names);
vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats);
err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
#define VHOST_TXQ_STATS \
VHOST_TXQ_STAT(tx_packets, "good_packets") \
VHOST_TXQ_STAT(tx_bytes, "good_bytes") \
VHOST_TXQ_STAT(tx_broadcast_packets, "broadcast_packets") \
VHOST_TXQ_STAT(tx_multicast_packets, "multicast_packets") \
VHOST_TXQ_STAT(rx_undersized_errors, "undersize_packets") \
VHOST_TXQ_STAT(tx_1_to_64_packets, "size_64_packets") \
VHOST_TXQ_STAT(tx_65_to_127_packets, "size_65_127_packets") \
VHOST_TXQ_STAT(tx_128_to_255_packets, "size_128_255_packets") \
VHOST_TXQ_STAT(tx_256_to_511_packets, "size_256_511_packets") \
VHOST_TXQ_STAT(tx_512_to_1023_packets, "size_512_1023_packets") \
VHOST_TXQ_STAT(tx_1024_to_1522_packets, "size_1024_1518_packets") \
VHOST_TXQ_STAT(tx_1523_to_max_packets, "size_1519_max_packets")
#define VHOST_TXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0;
VHOST_TXQ_STATS;
#undef VHOST_TXQ_STAT
for (int q = 0; q < dev->up.n_txq; q++) {
qid = q * VIRTIO_QNUM;
err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
for (int i = 0; i < vhost_stats_count; i++) {
#define VHOST_TXQ_STAT(MEMBER, NAME) \
if (string_ends_with(vhost_stats_names[i].name, NAME)) { \
dev->stats.MEMBER += vhost_stats[i].value; \
continue; \
}
VHOST_TXQ_STATS;
#undef VHOST_TXQ_STAT
}
}
/* OVS reports 64 bytes and smaller packets into "tx_1_to_64_packets".
 * Same as for rx, rx_undersized_errors is hijacked. */
dev->stats.tx_1_to_64_packets += dev->stats.rx_undersized_errors;
memset(&dev->stats.rx_undersized_errors, 0xff,
sizeof dev->stats.rx_undersized_errors);
#define VHOST_TXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER;
VHOST_TXQ_STATS;
#undef VHOST_TXQ_STAT
rte_spinlock_lock(&dev->stats_lock);
stats->rx_dropped = dev->stats.rx_dropped;
stats->tx_dropped = dev->stats.tx_dropped;
rte_spinlock_unlock(&dev->stats_lock);
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
err = 0;
out:
ovs_mutex_unlock(&dev->mutex);
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
free(vhost_stats);
free(vhost_stats_names);
return err;
}
/* Fills 'custom_stats' with the software counters of 'netdev' followed by
 * the per-virtqueue counters exposed by the vhost library.
 *
 * netdev_dpdk_get_sw_custom_stats() provides the leading entries.  If the
 * vhost device is running, the counters of each rx queue and then of each tx
 * queue are appended under the names returned by
 * rte_vhost_vring_stats_get_names().  When one of the vhost queries fails,
 * the entries gathered up to that point are kept and 'custom_stats->size'
 * only accounts for those.
 *
 * Always returns 0. */
static int
netdev_dpdk_vhost_get_custom_stats(const struct netdev *netdev,
                                   struct netdev_custom_stats *custom_stats)
{
    struct rte_vhost_stat_name *vhost_stats_names = NULL;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_vhost_stat *vhost_stats = NULL;
    int vhost_rxq_stats_count;
    int vhost_txq_stats_count;
    int stat_offset;
    int err;
    int qid;
    int vid;

    netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);
    stat_offset = custom_stats->size;

    ovs_mutex_lock(&dev->mutex);

    if (!is_vhost_running(dev)) {
        goto out;
    }

    vid = netdev_dpdk_get_vid(dev);

    /* Only the first queue of each direction is asked how many counters it
     * has; that count is then used for every queue of the direction. */
    qid = 0 * VIRTIO_QNUM + VIRTIO_TXQ;
    err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
    if (err < 0) {
        goto out;
    }
    vhost_rxq_stats_count = err;

    qid = 0 * VIRTIO_QNUM;
    err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
    if (err < 0) {
        goto out;
    }
    vhost_txq_stats_count = err;

    /* Grow the counter array once, to the size needed if every query below
     * succeeds, then rewind 'stat_offset' to the first free slot. */
    stat_offset += dev->up.n_rxq * vhost_rxq_stats_count;
    stat_offset += dev->up.n_txq * vhost_txq_stats_count;
    custom_stats->counters = xrealloc(custom_stats->counters,
                                      stat_offset *
                                      sizeof *custom_stats->counters);
    stat_offset = custom_stats->size;

    vhost_stats_names = xcalloc(vhost_rxq_stats_count,
                                sizeof *vhost_stats_names);
    vhost_stats = xcalloc(vhost_rxq_stats_count, sizeof *vhost_stats);

    /* OVS rx queue 'q' is read from vring 'q * VIRTIO_QNUM + VIRTIO_TXQ'. */
    for (int q = 0; q < dev->up.n_rxq; q++) {
        qid = q * VIRTIO_QNUM + VIRTIO_TXQ;

        err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
                                              vhost_rxq_stats_count);
        if (err != vhost_rxq_stats_count) {
            goto out;
        }

        err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
                                        vhost_rxq_stats_count);
        if (err != vhost_rxq_stats_count) {
            goto out;
        }

        for (int i = 0; i < vhost_rxq_stats_count; i++) {
            ovs_strlcpy(custom_stats->counters[stat_offset + i].name,
                        vhost_stats_names[i].name,
                        NETDEV_CUSTOM_STATS_NAME_SIZE);
            custom_stats->counters[stat_offset + i].value =
                vhost_stats[i].value;
        }
        stat_offset += vhost_rxq_stats_count;
    }

    /* The tx direction may have a different number of counters, so the
     * scratch buffers are reallocated with the tx count. */
    free(vhost_stats_names);
    vhost_stats_names = NULL;
    free(vhost_stats);
    vhost_stats = NULL;

    vhost_stats_names = xcalloc(vhost_txq_stats_count,
                                sizeof *vhost_stats_names);
    vhost_stats = xcalloc(vhost_txq_stats_count, sizeof *vhost_stats);

    /* OVS tx queue 'q' is read from vring 'q * VIRTIO_QNUM'. */
    for (int q = 0; q < dev->up.n_txq; q++) {
        qid = q * VIRTIO_QNUM;

        err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
                                              vhost_txq_stats_count);
        if (err != vhost_txq_stats_count) {
            goto out;
        }

        err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
                                        vhost_txq_stats_count);
        if (err != vhost_txq_stats_count) {
            goto out;
        }

        for (int i = 0; i < vhost_txq_stats_count; i++) {
            ovs_strlcpy(custom_stats->counters[stat_offset + i].name,
                        vhost_stats_names[i].name,
                        NETDEV_CUSTOM_STATS_NAME_SIZE);
            custom_stats->counters[stat_offset + i].value =
                vhost_stats[i].value;
        }
        stat_offset += vhost_txq_stats_count;
    }

out:
    ovs_mutex_unlock(&dev->mutex);

    /* Released here so that the error paths jumping to 'out' from inside the
     * loops do not leak the scratch buffers.  Both pointers are NULL when
     * nothing was allocated. */
    free(vhost_stats);
    free(vhost_stats_names);

    custom_stats->size = stat_offset;

    return 0;
}
/* Copies the DPDK extended statistics that have a counterpart in
 * 'struct netdev_stats' into 'stats'.
 *
 * 'xstats' and 'names' are walked in parallel over 'size' entries:
 * names[i].name is compared against the table below and, on an exact match,
 * xstats[i].value is stored in the associated member of 'stats'.  Entries
 * whose name is not listed are skipped, and members without a matching entry
 * are left untouched. */
static void
netdev_dpdk_convert_xstats(struct netdev_stats *stats,
                           const struct rte_eth_xstat *xstats,
                           const struct rte_eth_xstat_name *names,
                           const unsigned int size)
{
/* Pairs each 'struct netdev_stats' member with its DPDK xstats name. */
#define DPDK_XSTATS \
    DPDK_XSTAT(multicast,               "rx_multicast_packets"            ) \
    DPDK_XSTAT(tx_multicast_packets,    "tx_multicast_packets"            ) \
    DPDK_XSTAT(rx_broadcast_packets,    "rx_broadcast_packets"            ) \
    DPDK_XSTAT(tx_broadcast_packets,    "tx_broadcast_packets"            ) \
    DPDK_XSTAT(rx_undersized_errors,    "rx_undersized_errors"            ) \
    DPDK_XSTAT(rx_oversize_errors,      "rx_oversize_errors"              ) \
    DPDK_XSTAT(rx_fragmented_errors,    "rx_fragmented_errors"            ) \
    DPDK_XSTAT(rx_jabber_errors,        "rx_jabber_errors"                ) \
    DPDK_XSTAT(rx_1_to_64_packets,      "rx_size_64_packets"              ) \
    DPDK_XSTAT(rx_65_to_127_packets,    "rx_size_65_to_127_packets"       ) \
    DPDK_XSTAT(rx_128_to_255_packets,   "rx_size_128_to_255_packets"      ) \
    DPDK_XSTAT(rx_256_to_511_packets,   "rx_size_256_to_511_packets"      ) \
    DPDK_XSTAT(rx_512_to_1023_packets,  "rx_size_512_to_1023_packets"     ) \
    DPDK_XSTAT(rx_1024_to_1522_packets, "rx_size_1024_to_1522_packets"    ) \
    DPDK_XSTAT(rx_1523_to_max_packets,  "rx_size_1523_to_max_packets"     ) \
    DPDK_XSTAT(tx_1_to_64_packets,      "tx_size_64_packets"              ) \
    DPDK_XSTAT(tx_65_to_127_packets,    "tx_size_65_to_127_packets"       ) \
    DPDK_XSTAT(tx_128_to_255_packets,   "tx_size_128_to_255_packets"      ) \
    DPDK_XSTAT(tx_256_to_511_packets,   "tx_size_256_to_511_packets"      ) \
    DPDK_XSTAT(tx_512_to_1023_packets,  "tx_size_512_to_1023_packets"     ) \
    DPDK_XSTAT(tx_1024_to_1522_packets, "tx_size_1024_to_1522_packets"    ) \
    DPDK_XSTAT(tx_1523_to_max_packets,  "tx_size_1523_to_max_packets"     )

    for (unsigned int idx = 0; idx < size; idx++) {
        const char *xstat_name = names[idx].name;

        /* Expands to one name test per table row; the first row that
         * matches stores the value and moves on to the next entry. */
#define DPDK_XSTAT(MEMBER, NAME)                   \
        if (!strcmp(xstat_name, NAME)) {           \
            stats->MEMBER = xstats[idx].value;     \
            continue;                              \
        }
        DPDK_XSTATS;
#undef DPDK_XSTAT
    }
#undef DPDK_XSTATS
}
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
static int
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_stats rte_stats;
bool gg;
netdev_dpdk_get_carrier(netdev, &gg);
ovs_mutex_lock(&dev->mutex);
struct rte_eth_xstat *rte_xstats = NULL;
struct rte_eth_xstat_name *rte_xstats_names = NULL;
int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
dev->port_id);
ovs_mutex_unlock(&dev->mutex);
return EPROTO;
}
/* Get length of statistics */
rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
if (rte_xstats_len < 0) {
VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
dev->port_id);
goto out;
}
/* Reserve memory for xstats names and values */
rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
/* Retreive xstats names */
rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
rte_xstats_names,
rte_xstats_len);
if (rte_xstats_new_len != rte_xstats_len) {
VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
dev->port_id);
goto out;
}
/* Retreive xstats values */
memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
rte_xstats_len);
if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
rte_xstats_len);
} else {
VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
dev->port_id);
}
out:
free(rte_xstats);
free(rte_xstats_names);
stats->rx_packets = rte_stats.ipackets;
stats->tx_packets = rte_stats.opackets;
stats->rx_bytes = rte_stats.ibytes;
stats->tx_bytes = rte_stats.obytes;
stats->rx_errors = rte_stats.ierrors;
stats->tx_errors = rte_stats.oerrors;
rte_spinlock_lock(&dev->stats_lock);
stats->tx_dropped = dev->stats.tx_dropped;
stats->rx_dropped = dev->stats.rx_dropped;
rte_spinlock_unlock(&dev->stats_lock);
/* These are the available DPDK counters for packets not received due to
* local resource constraints in DPDK and NIC respectively. */
stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
stats->rx_missed_errors = rte_stats.imissed;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
/* Fills 'custom_stats' with the software counters of 'netdev' followed by
 * the extended statistics selected in 'dev->rte_xstats_ids'.
 *
 * The software counters are produced by netdev_dpdk_get_sw_custom_stats().
 * The xstats values are read with rte_eth_xstats_get_by_id(); if that call
 * does not return between 1 and 'dev->rte_xstats_ids_size' values, a warning
 * is logged and only the software counters are reported.
 *
 * Always returns 0. */
static int
netdev_dpdk_get_custom_stats(const struct netdev *netdev,
                             struct netdev_custom_stats *custom_stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);

    ovs_mutex_lock(&dev->mutex);

    if (dev->rte_xstats_ids_size > 0) {
        uint64_t *values = xcalloc(dev->rte_xstats_ids_size, sizeof *values);
        int n_values = rte_eth_xstats_get_by_id(dev->port_id,
                                                dev->rte_xstats_ids, values,
                                                dev->rte_xstats_ids_size);

        if (n_values <= 0 || n_values > dev->rte_xstats_ids_size) {
            VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
        } else {
            /* Index of the first slot after the software counters. */
            int base = custom_stats->size;

            custom_stats->size += n_values;
            custom_stats->counters = xrealloc(custom_stats->counters,
                                              custom_stats->size *
                                              sizeof *custom_stats->counters);

            for (int i = 0; i < n_values; i++) {
                const char *xstat_name =
                    netdev_dpdk_get_xstat_name(dev, dev->rte_xstats_ids[i]);

                ovs_strlcpy(custom_stats->counters[base + i].name,
                            xstat_name, NETDEV_CUSTOM_STATS_NAME_SIZE);
                custom_stats->counters[base + i].value = values[i];
            }
        }

        free(values);
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Allocates 'custom_stats->counters' and fills it with the software counters
 * kept in 'dev->sw_stats'.
 *
 * Every counter listed in SW_CSTATS is first copied while holding the device
 * mutex and the stats spinlock.  The array is then compacted in place:
 * counters whose value is UINT64_MAX are dropped, and the remaining ones are
 * named "ovs_<field>".  'custom_stats->size' ends up as the number of
 * counters kept.
 *
 * Always returns 0. */
static int
netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev,
                                struct netdev_custom_stats *custom_stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int src;
    int dst;

#define SW_CSTATS                    \
    SW_CSTAT(tx_retries)             \
    SW_CSTAT(tx_failure_drops)       \
    SW_CSTAT(tx_mtu_exceeded_drops)  \
    SW_CSTAT(tx_qos_drops)           \
    SW_CSTAT(rx_qos_drops)           \
    SW_CSTAT(tx_invalid_hwol_drops)

    /* Each table row expands to "+ 1", which yields the row count. */
#define SW_CSTAT(NAME) + 1
    custom_stats->size = SW_CSTATS;
#undef SW_CSTAT
    custom_stats->counters = xcalloc(custom_stats->size,
                                     sizeof *custom_stats->counters);

    /* Pass 1: snapshot every value, in table order, under the locks. */
    ovs_mutex_lock(&dev->mutex);
    rte_spinlock_lock(&dev->stats_lock);

    src = 0;
#define SW_CSTAT(NAME) \
    custom_stats->counters[src++].value = dev->sw_stats->NAME;
    SW_CSTATS;
#undef SW_CSTAT

    rte_spinlock_unlock(&dev->stats_lock);
    ovs_mutex_unlock(&dev->mutex);

    /* Pass 2: keep only the counters that differ from UINT64_MAX, moving
     * them towards the front of the array and naming them. */
    src = 0;
    dst = 0;
#define SW_CSTAT(NAME)                                               \
    if (custom_stats->counters[src].value != UINT64_MAX) {           \
        ovs_strlcpy(custom_stats->counters[dst].name,                \
                    "ovs_"#NAME, NETDEV_CUSTOM_STATS_NAME_SIZE);     \
        custom_stats->counters[dst].value =                          \
            custom_stats->counters[src].value;                       \
        dst++;                                                       \
    }                                                                \
    src++;
    SW_CSTATS;
#undef SW_CSTAT

    custom_stats->size = dst;

    return 0;
}
/* Returns the OpenFlow feature bit describing a full-duplex link running at
 * DPDK speed 'link_speed', or NETDEV_F_OTHER if no bit is defined for it. */
static uint32_t
netdev_dpdk_full_duplex_feature__(uint32_t link_speed)
{
    switch (link_speed) {
    case RTE_ETH_SPEED_NUM_10M:
        return NETDEV_F_10MB_FD;
    case RTE_ETH_SPEED_NUM_100M:
        return NETDEV_F_100MB_FD;
    case RTE_ETH_SPEED_NUM_1G:
        return NETDEV_F_1GB_FD;
    case RTE_ETH_SPEED_NUM_10G:
        return NETDEV_F_10GB_FD;
    case RTE_ETH_SPEED_NUM_40G:
        return NETDEV_F_40GB_FD;
    case RTE_ETH_SPEED_NUM_100G:
        return NETDEV_F_100GB_FD;
    default:
        return NETDEV_F_OTHER;
    }
}

/* Returns the OpenFlow feature bit describing a half-duplex link running at
 * DPDK speed 'link_speed', or NETDEV_F_OTHER if no bit is defined for it. */
static uint32_t
netdev_dpdk_half_duplex_feature__(uint32_t link_speed)
{
    switch (link_speed) {
    case RTE_ETH_SPEED_NUM_10M:
        return NETDEV_F_10MB_HD;
    case RTE_ETH_SPEED_NUM_100M:
        return NETDEV_F_100MB_HD;
    case RTE_ETH_SPEED_NUM_1G:
        return NETDEV_F_1GB_HD;
    default:
        return NETDEV_F_OTHER;
    }
}

/* Reports the link features of 'netdev'.
 *
 * '*current' is built from the link state cached in 'dev->link': one speed
 * bit chosen according to the duplex mode, plus NETDEV_F_AUTONEG when the
 * link auto-negotiates.  '*advertised', '*supported' and '*peer' are always
 * set to 0.
 *
 * Always returns 0. */
static int
netdev_dpdk_get_features(const struct netdev *netdev,
                         enum netdev_features *current,
                         enum netdev_features *advertised,
                         enum netdev_features *supported,
                         enum netdev_features *peer)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_link link;
    uint32_t feature = 0;

    /* Work on a private copy so the mutex is held only for the copy. */
    ovs_mutex_lock(&dev->mutex);
    link = dev->link;
    ovs_mutex_unlock(&dev->mutex);

    /* Match against OpenFlow defined link speed values. */
    if (link.link_duplex == RTE_ETH_LINK_FULL_DUPLEX) {
        feature |= netdev_dpdk_full_duplex_feature__(link.link_speed);
    } else if (link.link_duplex == RTE_ETH_LINK_HALF_DUPLEX) {
        feature |= netdev_dpdk_half_duplex_feature__(link.link_speed);
    }

    if (link.link_autoneg) {
        feature |= NETDEV_F_AUTONEG;
    }

    *current = feature;
    *peer = 0;
    *supported = 0;
    *advertised = 0;

    return 0;
}
netdev: Add netdev_get_speed() to netdev API. Currently, the netdev's speed is being calculated by taking the link's feature bits (using netdev_get_features()) and transforming them into bps. This mechanism can be both inaccurate and difficult to maintain, mainly because we currently use the feature bits supported by OpenFlow which would have to be extended to support all new feature bits of all netdev implementations while keeping the OpenFlow API intact. In order to expose the link speed accurately for all current and future hardware, add a new netdev API call that allows the implementations to provide the current and maximum link speeds in Mbps. Internally, the logic to get the maximum supported speed still relies on feature bits so it might still get out of sync in the future. However, the maximum configurable speed is not used as much as the current speed and these feature bits are not exposed through the netdev interface so it should be easier to add more. Use this new function instead of netdev_get_features() where the link speed is needed. As a consequence of this patch, link speeds of cards is properly reported (internally in OVSDB) even if not supported by OpenFlow. A test verifies this behavior using a tap device. Also, in order to avoid using the old, this patch adds a checkpatch.py warning if the old API is used. Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2137567 Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Adrian Moreno <amorenoz@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-17 10:08:11 +02:00
/* Reports the current and the maximum link speed of 'netdev'.
 *
 * '*current' is the speed cached in 'dev->link', or 0 when that speed is
 * RTE_ETH_SPEED_NUM_UNKNOWN.  '*max' is the highest speed advertised in the
 * 'speed_capa' capabilities returned by rte_eth_dev_info_get(), or 0 when no
 * known speed is advertised or when the device information cannot be
 * retrieved.
 *
 * Always returns 0. */
static int
netdev_dpdk_get_speed(const struct netdev *netdev, uint32_t *current,
                      uint32_t *max)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_dev_info dev_info;
    struct rte_eth_link link;
    int diag;

    ovs_mutex_lock(&dev->mutex);
    link = dev->link;
    diag = rte_eth_dev_info_get(dev->port_id, &dev_info);
    ovs_mutex_unlock(&dev->mutex);

    *current = link.link_speed != RTE_ETH_SPEED_NUM_UNKNOWN
               ? link.link_speed : 0;

    if (diag) {
        /* 'dev_info' must not be trusted when the query failed, so report
         * the maximum speed as unknown instead of reading 'speed_capa'. */
        *max = 0;
        return 0;
    }

    /* Checked from the fastest to the slowest so that the first match is
     * the maximum supported speed. */
    if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_200G) {
        *max = RTE_ETH_SPEED_NUM_200G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100G) {
        *max = RTE_ETH_SPEED_NUM_100G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_56G) {
        *max = RTE_ETH_SPEED_NUM_56G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_50G) {
        *max = RTE_ETH_SPEED_NUM_50G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_40G) {
        *max = RTE_ETH_SPEED_NUM_40G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_25G) {
        *max = RTE_ETH_SPEED_NUM_25G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_20G) {
        *max = RTE_ETH_SPEED_NUM_20G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10G) {
        *max = RTE_ETH_SPEED_NUM_10G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_5G) {
        *max = RTE_ETH_SPEED_NUM_5G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_2_5G) {
        *max = RTE_ETH_SPEED_NUM_2_5G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_1G) {
        *max = RTE_ETH_SPEED_NUM_1G;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M ||
               dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M_HD) {
        *max = RTE_ETH_SPEED_NUM_100M;
    } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M ||
               dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M_HD) {
        *max = RTE_ETH_SPEED_NUM_10M;
    } else {
        *max = 0;
    }

    return 0;
}
/* Allocates and configures an ingress policer.
 *
 * 'rate' and 'burst' are given in kbits and are converted to bytes to set
 * the committed information rate (cir) and committed burst size (cbs) of a
 * srTCM meter profile; the excess burst size (ebs) is set to 0.
 *
 * Returns the new policer, which the caller owns, or NULL (after logging an
 * error) if the rte_meter profile or meter could not be configured. */
static struct ingress_policer *
netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
{
    struct ingress_policer *policer = NULL;
    uint64_t rate_bytes;
    uint64_t burst_bytes;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    rte_spinlock_init(&policer->policer_lock);

    /* rte_meter requires bytes so convert kbits rate and burst to bytes.
     * The 1000ULL factor makes the arithmetic 64-bit wide. */
    rate_bytes = rate * 1000ULL / 8;
    burst_bytes = burst * 1000ULL / 8;

    policer->app_srtcm_params.cir = rate_bytes;
    policer->app_srtcm_params.cbs = burst_bytes;
    policer->app_srtcm_params.ebs = 0;

    /* The profile has to be configured first; the meter is only configured
     * when the profile was accepted. */
    err = rte_meter_srtcm_profile_config(&policer->in_prof,
                                         &policer->app_srtcm_params);
    if (!err) {
        err = rte_meter_srtcm_config(&policer->in_policer,
                                     &policer->in_prof);
    }
    if (err) {
        VLOG_ERR("Could not create rte meter for ingress policer");
        free(policer);
        return NULL;
    }

    return policer;
}
/* Configures ingress policing on 'netdev'.
 *
 * 'policer_rate' and 'policer_burst' are in kbits.  A rate of 0 removes the
 * policer and forces the burst to 0; a burst of 0 with a non-zero rate
 * defaults to 8000 kbits.  Nothing is done when the resulting settings match
 * the ones stored in the device.  The packet-based parameters are not used.
 *
 * Always returns 0. */
static int
netdev_dpdk_set_policing(struct netdev *netdev, uint32_t policer_rate,
                         uint32_t policer_burst,
                         uint32_t policer_kpkts_rate OVS_UNUSED,
                         uint32_t policer_kpkts_burst OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    /* Normalize the burst: 0 without a rate, 8000 kbits when a rate is set
     * but no burst was given, else the user-specified value. */
    if (!policer_rate) {
        policer_burst = 0;
    } else if (!policer_burst) {
        policer_burst = 8000;
    }

    ovs_mutex_lock(&dev->mutex);

    /* Matching settings are assumed to mean nothing changed since they were
     * last applied, so only differing settings are acted upon. */
    if (dev->policer_rate != policer_rate
        || dev->policer_burst != policer_burst) {
        struct ingress_policer *old_policer;
        struct ingress_policer *new_policer;

        /* Destroy any existing ingress policer for the device; the free is
         * deferred through RCU. */
        old_policer = ovsrcu_get_protected(struct ingress_policer *,
                                           &dev->ingress_policer);
        if (old_policer) {
            ovsrcu_postpone(free, old_policer);
        }

        new_policer = policer_rate
                      ? netdev_dpdk_policer_construct(policer_rate,
                                                      policer_burst)
                      : NULL;

        ovsrcu_set(&dev->ingress_policer, new_policer);
        dev->policer_rate = policer_rate;
        dev->policer_burst = policer_burst;
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Returns a synthetic ifindex for 'netdev', derived from a hash of its
 * name.  The result lies in the range 1..0xfffffe. */
static int
netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int ifindex;

    ovs_mutex_lock(&dev->mutex);
    /* Calculate hash from the netdev name.  Ensure that ifindex is a 24-bit
     * positive integer to meet RFC 2863 recommendations. */
    ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
    ovs_mutex_unlock(&dev->mutex);

    return ifindex;
}
/* Stores the link status of 'netdev' in '*carrier'.
 *
 * check_link_status() is called first, then the value is read from
 * 'dev->link.link_status', all under the device mutex.
 *
 * Always returns 0. */
static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    check_link_status(dev);
    *carrier = dev->link.link_status;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Stores the carrier state of vhost 'netdev' in '*carrier': true when
 * is_vhost_running() reports the device as running, false otherwise.
 *
 * Always returns 0. */
static int
netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *carrier = is_vhost_running(dev) ? true : false;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Returns the value of 'dev->link_reset_cnt' for 'netdev', read under the
 * device mutex. */
static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    long long int resets;

    ovs_mutex_lock(&dev->mutex);
    resets = dev->link_reset_cnt;
    ovs_mutex_unlock(&dev->mutex);

    return resets;
}
/* MII monitoring cannot be configured through this function: both arguments
 * are ignored and EOPNOTSUPP is always returned. */
static int
netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
                       long long int interval OVS_UNUSED)
{
    return EOPNOTSUPP;
}
static int
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
enum netdev_flags off, enum netdev_flags on,
enum netdev_flags *old_flagsp)
OVS_REQUIRES(dev->mutex)
{
if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
return EINVAL;
}
*old_flagsp = dev->flags;
dev->flags |= on;
dev->flags &= ~off;
if (dev->flags == *old_flagsp) {
return 0;
}
if (dev->type == DPDK_DEV_ETH) {
if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
int err;
if (dev->flags & NETDEV_UP) {
err = rte_eth_dev_set_link_up(dev->port_id);
} else {
err = rte_eth_dev_set_link_down(dev->port_id);
}
if (err == -ENOTSUP) {
VLOG_INFO("Interface %s does not support link state "
"configuration", netdev_get_name(&dev->up));
} else if (err < 0) {
VLOG_ERR("Interface %s link change error: %s",
netdev_get_name(&dev->up), rte_strerror(-err));
dev->flags = *old_flagsp;
return -err;
}
}
if (dev->flags & NETDEV_PROMISC) {
rte_eth_promiscuous_enable(dev->port_id);
}
netdev_change_seq_changed(&dev->up);
} else {
/* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
* running then change netdev's change_seq to trigger link state
* update. */
if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
&& is_vhost_running(dev)) {
netdev_change_seq_changed(&dev->up);
/* Clear statistics if device is getting up. */
if (NETDEV_UP & on) {
rte_spinlock_lock(&dev->stats_lock);
memset(&dev->stats, 0, sizeof dev->stats);
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
memset(dev->sw_stats, 0, sizeof *dev->sw_stats);
rte_spinlock_unlock(&dev->stats_lock);
}
}
}
return 0;
}
/* Locked wrapper around netdev_dpdk_update_flags__(): takes the device mutex
 * of 'netdev', applies the 'off'/'on' flag changes and returns that
 * function's result.  '*old_flagsp' is filled in by the wrapped function. */
static int
netdev_dpdk_update_flags(struct netdev *netdev,
                         enum netdev_flags off, enum netdev_flags on,
                         enum netdev_flags *old_flagsp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int retval;

    ovs_mutex_lock(&dev->mutex);
    retval = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
    ovs_mutex_unlock(&dev->mutex);

    return retval;
}
/* Adds the status of vhost-user 'netdev' to 'args' as key/value pairs.
 *
 * "mode" (client or server) and "status" (connected or disconnected) are
 * always reported.  When the device is connected, each of "socket",
 * "features", "mtu", "numa", "num_of_vrings" and "vring_<N>_size" is added
 * if the vhost library provides it, "userspace-tso" is reported as disabled
 * when userspace TSO is enabled but the OVS_VIRTIO_F_WORKAROUND bit is set
 * in 'dev->virtio_features_state', and "n_rxq"/"n_txq" are added last.
 *
 * Always returns 0. */
static int
netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
                                  struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
    smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");

    int vid = netdev_dpdk_get_vid(dev);
    if (vid < 0) {
        /* Without a valid vid there is nothing more to query. */
        smap_add_format(args, "status", "disconnected");
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    }
    smap_add_format(args, "status", "connected");

    char socket_name[PATH_MAX];
    if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
        smap_add_format(args, "socket", "%s", socket_name);
    }

    uint64_t features;
    if (!rte_vhost_get_negotiated_features(vid, &features)) {
        smap_add_format(args, "features", "0x%016"PRIx64, features);
    }

    uint16_t mtu;
    if (!rte_vhost_get_mtu(vid, &mtu)) {
        smap_add_format(args, "mtu", "%d", mtu);
    }

    int numa = rte_vhost_get_numa_node(vid);
    if (numa >= 0) {
        smap_add_format(args, "numa", "%d", numa);
    }

    uint16_t vring_num = rte_vhost_get_vring_num(vid);
    if (vring_num) {
        smap_add_format(args, "num_of_vrings", "%d", vring_num);
    }

    for (int i = 0; i < vring_num; i++) {
        struct rte_vhost_vring vring;

        /* 'vring' is only meaningful when the query succeeds; skip the ring
         * otherwise rather than reporting an uninitialized size. */
        if (rte_vhost_get_vhost_vring(vid, i, &vring)) {
            continue;
        }
        smap_add_nocopy(args, xasprintf("vring_%d_size", i),
                        xasprintf("%d", vring.size));
    }

    if (userspace_tso_enabled()
        && dev->virtio_features_state & OVS_VIRTIO_F_WORKAROUND) {

        smap_add_format(args, "userspace-tso", "disabled");
    }

    smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
    smap_add_format(args, "n_txq", "%d", netdev->n_txq);

    ovs_mutex_unlock(&dev->mutex);
    return 0;
}
/*
 * Translate a DPDK numeric link speed (one of the RTE_ETH_SPEED_NUM_*
 * values) into a human readable string.  Speeds that are not listed in the
 * table below are reported as "Not Defined".
 */
static const char *
netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
{
    static const struct {
        uint32_t speed;
        const char *name;
    } speed_names[] = {
        { RTE_ETH_SPEED_NUM_10M,  "10Mbps" },
        { RTE_ETH_SPEED_NUM_100M, "100Mbps" },
        { RTE_ETH_SPEED_NUM_1G,   "1Gbps" },
        { RTE_ETH_SPEED_NUM_2_5G, "2.5Gbps" },
        { RTE_ETH_SPEED_NUM_5G,   "5Gbps" },
        { RTE_ETH_SPEED_NUM_10G,  "10Gbps" },
        { RTE_ETH_SPEED_NUM_20G,  "20Gbps" },
        { RTE_ETH_SPEED_NUM_25G,  "25Gbps" },
        { RTE_ETH_SPEED_NUM_40G,  "40Gbps" },
        { RTE_ETH_SPEED_NUM_50G,  "50Gbps" },
        { RTE_ETH_SPEED_NUM_56G,  "56Gbps" },
        { RTE_ETH_SPEED_NUM_100G, "100Gbps" },
    };

    for (size_t i = 0; i < sizeof speed_names / sizeof speed_names[0]; i++) {
        if (speed_names[i].speed == link_speed) {
            return speed_names[i].name;
        }
    }

    return "Not Defined";
}
static int
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_dev_info dev_info;
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
size_t rx_steer_flows_num;
uint64_t rx_steer_flags;
const char *bus_info;
uint32_t link_speed;
uint32_t dev_flags;
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
int n_rxq;
if (!rte_eth_dev_is_valid_port(dev->port_id)) {
return ENODEV;
}
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
rte_eth_dev_info_get(dev->port_id, &dev_info);
link_speed = dev->link.link_speed;
dev_flags = *dev_info.dev_flags;
bus_info = rte_dev_bus_info(dev_info.device);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
rx_steer_flags = dev->rx_steer_flags;
rx_steer_flows_num = dev->rx_steer_flows_num;
n_rxq = netdev->n_rxq;
ovs_mutex_unlock(&dev->mutex);
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
ovs_mutex_unlock(&dpdk_mutex);
smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
smap_add_format(args, "numa_id", "%d",
rte_eth_dev_socket_id(dev->port_id));
smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
smap_add_format(args, "max_hash_mac_addrs", "%u",
dev_info.max_hash_mac_addrs);
smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
smap_add_format(args, "n_txq", "%d", netdev->n_txq);
smap_add(args, "rx_csum_offload",
dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD
? "true" : "false");
/* Querying the DPDK library for iftype may be done in future, pending
* support; cf. RFC 3635 Section 3.2.4. */
enum { IF_TYPE_ETHERNETCSMACD = 6 };
smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
smap_add_format(args, "if_descr", "%s %s", rte_version(),
dev_info.driver_name);
smap_add_format(args, "bus_info", "bus_name=%s%s%s",
rte_bus_name(rte_dev_bus(dev_info.device)),
bus_info != NULL ? ", " : "",
bus_info != NULL ? bus_info : "");
/* Not all link speeds are defined in the OpenFlow specs e.g. 25 Gbps.
* In that case the speed will not be reported as part of the usual
* call to get_features(). Get the link speed of the device and add it
* to the device status in an easy to read string format.
*/
smap_add(args, "link_speed",
netdev_dpdk_link_speed_to_str__(link_speed));
if (dev_flags & RTE_ETH_DEV_REPRESENTOR) {
smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT,
ETH_ADDR_ARGS(dev->hwaddr));
}
if (rx_steer_flags && !rx_steer_flows_num) {
smap_add(args, "rx-steering", "unsupported");
} else if (rx_steer_flags == DPDK_RX_STEER_LACP) {
smap_add(args, "rx-steering", "rss+lacp");
} else {
ovs_assert(!rx_steer_flags);
smap_add(args, "rx-steering", "rss");
}
if (rx_steer_flags && rx_steer_flows_num) {
smap_add_format(args, "rx_steering_queue", "%d", n_rxq - 1);
if (n_rxq > 2) {
smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
} else {
smap_add(args, "rss_queues", "0");
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
}
}
return 0;
}
/* Sets the administrative state of 'dev': when 'admin_state' is true,
 * NETDEV_UP is passed to netdev_dpdk_update_flags__() as the third argument
 * (and 0 as the second); when false, the two are swapped.
 *
 * The previous flags reported by netdev_dpdk_update_flags__() are not used.
 * The caller must hold dev->mutex. */
static void
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags second_mask = admin_state ? 0 : NETDEV_UP;
    enum netdev_flags third_mask = admin_state ? NETDEV_UP : 0;
    enum netdev_flags prev_flags;

    netdev_dpdk_update_flags__(dev, second_mask, third_mask, &prev_flags);
}
/* unixctl handler that changes the admin state of DPDK interfaces.
 *
 * The last argument must be "up" or "down" (case-insensitive); anything else
 * is answered with "Invalid Admin State".  When more than two arguments are
 * present, argv[1] names the single interface to change and it must belong
 * to a DPDK netdev class, otherwise "Not a DPDK Interface" is returned.
 * With no interface name, every device on 'dpdk_list' is updated while
 * 'dpdk_mutex' is held.  On success the reply is "OK". */
static void
netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)
{
    const char *state = argv[argc - 1];
    bool up = !strcasecmp(state, "up");

    if (!up && strcasecmp(state, "down")) {
        unixctl_command_reply_error(conn, "Invalid Admin State");
        return;
    }

    if (argc <= 2) {
        /* No interface given: apply the state to all DPDK devices. */
        struct netdev_dpdk *dev;

        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
    } else {
        struct netdev *netdev = netdev_from_name(argv[1]);
        struct netdev_dpdk *dev;

        if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            /* 'netdev' may be NULL here, exactly as in the lookup-failure
             * case; the reference (if any) is still released. */
            netdev_close(netdev);
            return;
        }

        dev = netdev_dpdk_cast(netdev);
        ovs_mutex_lock(&dev->mutex);
        netdev_dpdk_set_admin_state__(dev, up);
        ovs_mutex_unlock(&dev->mutex);

        netdev_close(netdev);
    }

    unixctl_command_reply(conn, "OK");
}
dpdk: Fix device cleanup. Commit 5dcde09c80a8 was introduced to make detaching more automatic without using an additional command beyond ovs-vsctl del-port <br> <port>. Sometimes, since commit 5dcde09c80a8, dpdk devices are not detached when del-port is issued; command example: sudo ovs-vsctl del-port br0 dpdk1 This can happen when vswitchd is (re)started with an existing database and devices are already bound to dpdk. A minimal recipe to reproduce the issue is: 1/ Starting with darrell@prmh-nsx-perf-server125:~$ sudo ovs-vsctl show 1c50d8ee-b17f-4fac-a595-03b0da8c8275 Bridge "br0" Port "br0" Interface "br0" type: internal Port "dpdk1" Interface "dpdk1" type: dpdk options: {dpdk-devargs="0000:04:00.1"} Port "dpdk0" Interface "dpdk0" type: dpdk options: {dpdk-devargs="0000:04:00.0"} darrell@prmh-nsx-perf-server125:~$ /usr/src/dpdk-16.11/tools/dpdk-devbind.py --status Network devices using DPDK-compatible driver ============================================ 0000:04:00.0 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 0000:04:00.1 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 2/ restart vswitchd 3/ run sudo ovs-vsctl del-port br0 dpdk1 and find the interface is NOT detached; there is no info log ‘Device '0000:04:00.1' detached’. A more verbose discussion is here: https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/333462.html along with another possible solution. Since we are nearing the end of a release, a safe approach is needed, at this time. One approach is to revert 5dcde09c80a8. This patch does not do that but reinstates the command ovs-appctl netdev-dpdk/detach to handle cases when del-port will not work. 
To detach the device, run the reinstated command ovs-appctl netdev-dpdk/detach 0000:04:00.1 Observe console output ‘Device '0000:04:00.1' has been detached’ Fixes: 5dcde09c80a8 ("netdev-dpdk: Fix device leak on port deletion.") CC: Ilya Maximets <i.maximets@samsung.com> Acked-by: Aaron Conole <aconole@redhat.com> Acked-by: Fischetti, Antonio <antonio.fischetti@intel.com> Signed-off-by: Darrell Ball <dlu998@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-01 17:04:29 -07:00
/* unixctl handler that detaches the DPDK device identified by the devargs
 * string in argv[1].
 *
 * The whole operation runs under 'dpdk_mutex'.  The reply is an error when:
 *   - argv[1] does not resolve to a valid DPDK port,
 *   - any sibling port of that device is still referenced by a netdev on
 *     'dpdk_list' (the names of those interfaces are listed in the reply),
 *   - the device information cannot be retrieved, or
 *   - rte_dev_remove() fails.
 * Otherwise the port is closed, the underlying device is removed and a
 * success message is returned.
 *
 * 'response' is heap-allocated on every path and freed after it is sent. */
static void
netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
                   const char *argv[], void *aux OVS_UNUSED)
{
    struct ds used_interfaces = DS_EMPTY_INITIALIZER;
    struct rte_eth_dev_info dev_info;
    dpdk_port_t sibling_port_id;
    dpdk_port_t port_id;
    bool used = false;
    char *response;

    ovs_mutex_lock(&dpdk_mutex);

    port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
    if (!rte_eth_dev_is_valid_port(port_id)) {
        response = xasprintf("Device '%s' not found in DPDK", argv[1]);
        goto error;
    }

    /* Build the "in use" message up front; it is only sent if at least one
     * sibling port turns out to be attached to an interface. */
    ds_put_format(&used_interfaces,
                  "Device '%s' is being used by the following interfaces:",
                  argv[1]);

    RTE_ETH_FOREACH_DEV_SIBLING (sibling_port_id, port_id) {
        struct netdev_dpdk *dev;

        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            if (dev->port_id != sibling_port_id) {
                continue;
            }
            used = true;
            ds_put_format(&used_interfaces, " %s",
                          netdev_get_name(&dev->up));
            break;
        }
    }

    if (used) {
        ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
        response = ds_steal_cstr(&used_interfaces);
        ds_destroy(&used_interfaces);
        goto error;
    }
    ds_destroy(&used_interfaces);

    /* The device handle needed by rte_dev_remove() comes from 'dev_info'.
     * If the query fails, 'dev_info' cannot be trusted, so give up before
     * closing the port instead of handing a bogus handle to
     * rte_dev_remove(). */
    if (rte_eth_dev_info_get(port_id, &dev_info) != 0) {
        response = xasprintf("Device '%s' can not be detached", argv[1]);
        goto error;
    }

    rte_eth_dev_close(port_id);
    if (rte_dev_remove(dev_info.device) < 0) {
        response = xasprintf("Device '%s' can not be detached", argv[1]);
        goto error;
    }

    response = xasprintf("All devices shared with device '%s' "
                         "have been detached", argv[1]);

    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply(conn, response);
    free(response);
    return;

error:
    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply_error(conn, response);
    free(response);
}
/*
 * unixctl handler for "netdev-dpdk/get-mempool-info [netdev]".
 *
 * With a netdev argument, replies with rte_mempool_dump() output for the
 * mempool attached to that interface, followed by its available and in-use
 * counts.  Without an argument, replies with rte_mempool_list_dump() output.
 *
 * An error reply is sent when the named netdev does not exist or is not a
 * DPDK interface, when the in-memory stream cannot be opened, or when the
 * interface has no mempool attached.
 */
static void
netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
                             int argc, const char *argv[],
                             void *aux OVS_UNUSED)
{
    size_t size;
    FILE *stream;
    char *response = NULL;
    const char *error = NULL;
    struct netdev *netdev = NULL;

    if (argc == 2) {
        netdev = netdev_from_name(argv[1]);
        if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            goto out;
        }
    }

    stream = open_memstream(&response, &size);
    if (!stream) {
        response = xasprintf("Unable to open memstream: %s.",
                             ovs_strerror(errno));
        unixctl_command_reply_error(conn, response);
        goto out;
    }

    if (netdev) {
        struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        ovs_mutex_lock(&dpdk_mp_mutex);

        /* new_device() tests 'dev->dpdk_mp == NULL', so a device may exist
         * without a mempool; do not dereference it blindly. */
        if (dev->dpdk_mp) {
            rte_mempool_dump(stream, dev->dpdk_mp->mp);
            fprintf(stream, " count: avail (%u), in use (%u)\n",
                    rte_mempool_avail_count(dev->dpdk_mp->mp),
                    rte_mempool_in_use_count(dev->dpdk_mp->mp));
        } else {
            error = "No mempool allocated for this interface";
        }

        ovs_mutex_unlock(&dpdk_mp_mutex);
        ovs_mutex_unlock(&dev->mutex);
    } else {
        ovs_mutex_lock(&dpdk_mp_mutex);
        rte_mempool_list_dump(stream);
        ovs_mutex_unlock(&dpdk_mp_mutex);
    }

    /* Closing the stream finalizes 'response', which is freed below. */
    fclose(stream);

    if (error) {
        unixctl_command_reply_error(conn, error);
    } else {
        unixctl_command_reply(conn, response);
    }
out:
    free(response);
    netdev_close(netdev);
}
/*
 * Disables guest notifications on every vring of vhost device 'vid', so
 * that we do not receive interrupts.
 */
static void
set_irq_status(int vid)
{
    uint32_t vring = 0;

    /* The vring count is queried again before each step. */
    while (vring < rte_vhost_get_vring_num(vid)) {
        rte_vhost_enable_guest_notification(vid, vring, 0);
        vring++;
    }
}
/*
 * Recomputes the vhost-user tx queue mapping of 'dev'.  Every queue that is
 * not mapped to itself is redirected, round-robin, to one of the queues that
 * are.  If no queue is mapped to itself, all of them are mapped to
 * OVS_VHOST_QUEUE_DISABLED.
 *
 * Must be called after each enabling/disabling of queues and after n_txq
 * modifications.
 */
static void
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int n_txq = dev->up.n_txq;
    int *targets = xcalloc(n_txq, sizeof *targets);
    int n_targets = 0;
    int next = 0;
    int q;

    /* Collect the enabled queues: those are always mapped to themselves. */
    for (q = 0; q < n_txq; q++) {
        if (dev->tx_q[q].map != q) {
            continue;
        }
        targets[n_targets] = q;
        n_targets++;
    }

    if (!n_targets && n_txq) {
        targets[0] = OVS_VHOST_QUEUE_DISABLED;
        n_targets = 1;
    }

    /* Spread the remaining queues over the collected targets. */
    for (q = 0; q < n_txq; q++) {
        if (dev->tx_q[q].map == q) {
            continue;
        }
        dev->tx_q[q].map = targets[next];
        if (++next == n_targets) {
            next = 0;
        }
    }

    if (VLOG_IS_DBG_ENABLED()) {
        struct ds text = DS_EMPTY_INITIALIZER;

        ds_put_format(&text, "TX queue mapping for port '%s':\n",
                      netdev_get_name(&dev->up));
        for (q = 0; q < n_txq; q++) {
            ds_put_format(&text, "%2d --> %2d\n", q, dev->tx_q[q].map);
        }

        VLOG_DBG("%s", ds_cstr(&text));
        ds_destroy(&text);
    }

    free(targets);
}
/*
* A new virtio-net device is added to a vhost port.
*
* Looks for the port in 'dpdk_list' whose 'vhost_id' equals the interface
* name reported by rte_vhost_get_ifname() for 'vid'.  On a match, records the
* queue pair count and NUMA node of the device, updates the offload features,
* stores 'vid' in the port and disables guest notifications.
*
* Returns 0 if a matching port was found, -1 otherwise.
*/
static int
new_device(int vid)
{
struct netdev_dpdk *dev;
bool exists = false;
int newnode = 0;
char ifname[IF_NAME_SZ];
rte_vhost_get_ifname(vid, ifname, sizeof ifname);
ovs_mutex_lock(&dpdk_mutex);
/* Add device to the vhost port with the same name as that passed down. */
LIST_FOR_EACH(dev, list_node, &dpdk_list) {
ovs_mutex_lock(&dev->mutex);
if (nullable_string_is_equal(ifname, dev->vhost_id)) {
/* Number of queue pairs: vring count divided by VIRTIO_QNUM. */
uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
uint64_t features;
/* Get NUMA information */
newnode = rte_vhost_get_numa_node(vid);
if (newnode == -1) {
#ifdef VHOST_NUMA
VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
ifname);
#endif
/* Fall back to the socket the port is currently using. */
newnode = dev->socket_id;
}
dev->virtio_features_state |= OVS_VIRTIO_F_NEGOTIATED;
/* A reconfiguration is requested when the device needs more queue
* pairs than currently requested, sits on a different NUMA node, or
* the port has no mempool yet. */
if (dev->requested_n_txq < qp_num
|| dev->requested_n_rxq < qp_num
|| dev->requested_socket_id != newnode
|| dev->dpdk_mp == NULL) {
dev->requested_socket_id = newnode;
dev->requested_n_rxq = qp_num;
dev->requested_n_txq = qp_num;
netdev_request_reconfigure(&dev->up);
} else {
/* Reconfiguration not required. */
dev->vhost_reconfigured = true;
}
/* A non-zero return means the negotiated features are unavailable; in
* that case only the IPv4 checksum flag below is added. */
if (rte_vhost_get_negotiated_features(vid, &features)) {
VLOG_INFO("Error checking guest features for "
"vHost Device '%s'", dev->vhost_id);
} else {
if (features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) {
dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
}
/* TSO is enabled only if userspace TSO is on, the feature state has
* the OVS_VIRTIO_F_CLEAN bit set, and the guest negotiated both
* GUEST_TSO4 and GUEST_TSO6. */
if (userspace_tso_enabled()
&& dev->virtio_features_state & OVS_VIRTIO_F_CLEAN) {
if (features & (1ULL << VIRTIO_NET_F_GUEST_TSO4)
&& features & (1ULL << VIRTIO_NET_F_GUEST_TSO6)) {
dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
VLOG_DBG("%s: TSO enabled on vhost port",
netdev_get_name(&dev->up));
} else {
VLOG_WARN("%s: Tx TSO offload is not supported.",
netdev_get_name(&dev->up));
}
}
}
/* There is no support in virtio net to offload IPv4 csum,
* but the vhost library handles IPv4 csum offloading fine. */
dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
netdev_dpdk_update_netdev_flags(dev);
/* Store the vid in the port; netdev_dpdk_get_vid() returns it from now
* on. */
ovsrcu_index_set(&dev->vid, vid);
exists = true;
/* Disable notifications. */
set_irq_status(vid);
netdev_change_seq_changed(&dev->up);
ovs_mutex_unlock(&dev->mutex);
break;
}
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
if (!exists) {
VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
return -1;
}
VLOG_INFO("vHost Device '%s' has been added on numa node %i",
ifname, newnode);
return 0;
}
/* Resets the mapping of each of the 'dev->up.n_txq' tx queues of vhost
 * interface 'dev' to OVS_VHOST_QUEUE_MAP_UNKNOWN. */
static void
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int qid = 0;

    while (qid < dev->up.n_txq) {
        dev->tx_q[qid].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
        qid++;
    }
}
/*
 * Removes virtio-net device 'vid' from the vhost port it is attached to.
 *
 * The port's vid is reset to -1 and its per-queue state and offload
 * capabilities are cleared.  The function then waits for an RCU grace
 * period so that other threads have quiesced before it returns.
 */
static void
destroy_device(int vid)
{
    struct netdev_dpdk *dev;
    bool found = false;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (netdev_dpdk_get_vid(dev) != vid) {
            continue;
        }

        ovs_mutex_lock(&dev->mutex);
        dev->vhost_reconfigured = false;
        ovsrcu_index_set(&dev->vid, -1);
        memset(dev->vhost_rxq_enabled, 0,
               dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
        netdev_dpdk_txq_map_clear(dev);

        /* Clear offload capabilities before next new_device. */
        dev->hw_ol_features = 0;
        netdev_dpdk_update_netdev_flags(dev);

        netdev_change_seq_changed(&dev->up);
        ovs_mutex_unlock(&dev->mutex);

        found = true;
        break;
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!found) {
        VLOG_INFO("vHost Device '%s' not found", ifname);
        return;
    }

    /* Wait for other threads to quiesce now that the vid has been reset to
     * -1, before returning. */
    ovsrcu_synchronize();

    /* As call to ovsrcu_synchronize() will end the quiescent state,
     * put thread back into quiescent state before returning. */
    ovsrcu_quiesce_start();
    VLOG_INFO("vHost Device '%s' has been removed", ifname);
}
/* Queue of pending vring state changes.  vring_state_changed() inserts
* entries; netdev_dpdk_vhost_events_main() pops and processes them. */
static struct mpsc_queue vhost_state_change_queue
= MPSC_QUEUE_INITIALIZER(&vhost_state_change_queue);
/* Counter incremented on each insertion into 'vhost_state_change_queue' and
* decremented after each entry has been processed. */
static atomic_uint64_t vhost_state_change_queue_size;
/* One queued vring state change notification. */
struct vhost_state_change {
/* Linkage into 'vhost_state_change_queue'. */
struct mpsc_queue_node node;
/* Interface name as filled in by rte_vhost_get_ifname(). */
char ifname[IF_NAME_SZ];
/* Vring index as passed to vring_state_changed(). */
uint16_t queue_id;
/* Non-zero if the vring was enabled, zero if it was disabled. */
int enable;
};
/*
 * Applies queued vring state change 'sc' to the vhost port whose 'vhost_id'
 * equals 'sc->ifname'.
 *
 * For an rx queue, the enabled flag is updated and the netdev change
 * sequence is bumped if the flag actually changed.  For a tx queue, the
 * queue is mapped to itself or to OVS_VHOST_QUEUE_DISABLED and the whole tx
 * mapping is recomputed.
 */
static void
vring_state_changed__(struct vhost_state_change *sc)
{
    /* The guest's tx vring (VIRTIO_TXQ) is the rx queue from our side. */
    bool is_rx = (sc->queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
    int qid = sc->queue_id / VIRTIO_QNUM;
    struct netdev_dpdk *dev;
    bool found = false;

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (!nullable_string_is_equal(sc->ifname, dev->vhost_id)) {
            ovs_mutex_unlock(&dev->mutex);
            continue;
        }

        if (is_rx) {
            bool was_enabled = dev->vhost_rxq_enabled[qid];

            dev->vhost_rxq_enabled[qid] = sc->enable != 0;
            if (dev->vhost_rxq_enabled[qid] != was_enabled) {
                netdev_change_seq_changed(&dev->up);
            }
        } else {
            dev->tx_q[qid].map = sc->enable ? qid : OVS_VHOST_QUEUE_DISABLED;
            netdev_dpdk_remap_txqs(dev);
        }

        found = true;
        ovs_mutex_unlock(&dev->mutex);
        break;
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!found) {
        VLOG_INFO("vHost Device '%s' not found", sc->ifname);
        return;
    }

    VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
              "changed to \'%s\'", sc->queue_id, is_rx ? "rx" : "tx",
              qid, sc->ifname, sc->enable == 1 ? "enabled" : "disabled");
}
#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN 1
#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX 64

/*
 * Thread body that drains 'vhost_state_change_queue' forever.
 *
 * While the queue is empty the thread sleeps, starting at
 * NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN milliseconds and doubling the delay
 * after each sleep until it reaches NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX.
 * Each popped entry is applied with vring_state_changed__() and then freed.
 */
static void *
netdev_dpdk_vhost_events_main(void *arg OVS_UNUSED)
{
    mpsc_queue_acquire(&vhost_state_change_queue);

    while (true) {
        uint64_t delay_ms = NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN;
        struct mpsc_queue_node *node;

        while (!mpsc_queue_tail(&vhost_state_change_queue)) {
            /* 1E6 nanoseconds per millisecond. */
            xnanosleep(delay_ms * 1E6);
            if (delay_ms < NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX) {
                delay_ms *= 2;
            }
        }

        MPSC_QUEUE_FOR_EACH_POP (node, &vhost_state_change_queue) {
            struct vhost_state_change *sc
                = CONTAINER_OF(node, struct vhost_state_change, node);

            vring_state_changed__(sc);
            free(sc);
            atomic_count_dec64(&vhost_state_change_queue_size);
        }
    }

    OVS_NOT_REACHED();
    mpsc_queue_release(&vhost_state_change_queue);
    return NULL;
}
static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
static struct vlog_rate_limit vhost_rl = VLOG_RATE_LIMIT_INIT(5, 5);
struct vhost_state_change *sc;
sc = xmalloc(sizeof *sc);
if (!rte_vhost_get_ifname(vid, sc->ifname, sizeof sc->ifname)) {
uint64_t queue_size;
sc->queue_id = queue_id;
sc->enable = enable;
mpsc_queue_insert(&vhost_state_change_queue, &sc->node);
queue_size = atomic_count_inc64(&vhost_state_change_queue_size);
if (queue_size >= 1000) {
VLOG_WARN_RL(&vhost_rl, "vring state change queue has %"PRIu64" "
"entries. Last update was for socket %s.", queue_size,
sc->ifname);
}
} else {
free(sc);
}
return 0;
}
/*
* Handles the destruction of the vhost connection of device 'vid'.
*
* For the port whose 'vhost_id' equals the interface name of 'vid', restores
* the requested number of queue pairs to NR_QUEUE and, unless a
* reconfiguration is already pending for the virtio features, selects the
* virtio feature state to use for the next connection.
*/
static void
destroy_connection(int vid)
{
struct netdev_dpdk *dev;
char ifname[IF_NAME_SZ];
bool exists = false;
rte_vhost_get_ifname(vid, ifname, sizeof ifname);
ovs_mutex_lock(&dpdk_mutex);
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
ovs_mutex_lock(&dev->mutex);
if (nullable_string_is_equal(ifname, dev->vhost_id)) {
uint32_t qp_num = NR_QUEUE;
/* A vid >= 0 means the port still has a device attached. */
if (netdev_dpdk_get_vid(dev) >= 0) {
VLOG_ERR("Connection on socket '%s' destroyed while vhost "
"device still attached.", dev->vhost_id);
}
/* Restore the number of queue pairs to default. */
if (dev->requested_n_txq != qp_num
|| dev->requested_n_rxq != qp_num) {
dev->requested_n_rxq = qp_num;
dev->requested_n_txq = qp_num;
netdev_request_reconfigure(&dev->up);
}
if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) {
/* The socket disconnected before reaching new_device. It
* likely means that the guest did not agree with the virtio
* features. */
VLOG_WARN_RL(&rl, "Connection on socket '%s' closed during "
"initialization.", dev->vhost_id);
}
/* Skip the state transition while a previous one is still waiting for
* its reconfiguration. */
if (!(dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING)) {
switch (dev->virtio_features_state) {
case OVS_VIRTIO_F_CLEAN:
/* Clean features were not negotiated: try the workaround next. */
dev->virtio_features_state = OVS_VIRTIO_F_WORKAROUND;
break;
case OVS_VIRTIO_F_WORKAROUND:
/* The workaround was not negotiated either: go back to clean. */
dev->virtio_features_state = OVS_VIRTIO_F_CLEAN;
break;
case OVS_VIRTIO_F_CLEAN_NEGOTIATED:
/* The virtio features were clean and got accepted by the
* guest. We expect it will be the case in the future and
* change nothing. */
break;
case OVS_VIRTIO_F_WORKAROUND_NEGOTIATED:
/* Let's try to go with clean virtio features on a next
* connection. */
dev->virtio_features_state = OVS_VIRTIO_F_CLEAN;
break;
default:
OVS_NOT_REACHED();
}
/* If the resulting state does not have the negotiated bit set, mark a
* reconfiguration as pending and request it. */
if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) {
dev->virtio_features_state |= OVS_VIRTIO_F_RECONF_PENDING;
netdev_request_reconfigure(&dev->up);
}
}
ovs_mutex_unlock(&dev->mutex);
exists = true;
break;
}
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
if (exists) {
VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
} else {
VLOG_INFO("vHost Device '%s' not found", ifname);
}
}
/*
 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
 * or vhostuserclient netdev.
 *
 * Returns a value greater than or equal to zero for a valid vid, or '-1' if
 * there is no valid vid associated.  A vid of '-1' must not be used in
 * rte_vhost_ API calls.
 *
 * Once obtained and validated, a vid can be used by a PMD for multiple
 * subsequent rte_vhost API calls until the PMD quiesces.  A PMD should
 * not fetch the vid again for each of a series of API calls.
 */
int
netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
{
    int vid = ovsrcu_index_get(&dev->vid);

    return vid;
}
static int
netdev_dpdk_class_init(void)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
/* This function can be called for different classes. The initialization
* needs to be done only once */
if (ovsthread_once_start(&once)) {
int ret;
ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
unixctl_command_register("netdev-dpdk/set-admin-state",
"[netdev] up|down", 1, 2,
netdev_dpdk_set_admin_state, NULL);
dpdk: Fix device cleanup. Commit 5dcde09c80a8 was introduced to make detaching more automatic without using an additional command beyond ovs-vsctl del-port <br> <port>. Sometimes, since commit 5dcde09c80a8, dpdk devices are not detached when del-port is issued; command example: sudo ovs-vsctl del-port br0 dpdk1 This can happen when vswitchd is (re)started with an existing database and devices are already bound to dpdk. A minimal recipe to reproduce the issue is: 1/ Starting with darrell@prmh-nsx-perf-server125:~$ sudo ovs-vsctl show 1c50d8ee-b17f-4fac-a595-03b0da8c8275 Bridge "br0" Port "br0" Interface "br0" type: internal Port "dpdk1" Interface "dpdk1" type: dpdk options: {dpdk-devargs="0000:04:00.1"} Port "dpdk0" Interface "dpdk0" type: dpdk options: {dpdk-devargs="0000:04:00.0"} darrell@prmh-nsx-perf-server125:~$ /usr/src/dpdk-16.11/tools/dpdk-devbind.py --status Network devices using DPDK-compatible driver ============================================ 0000:04:00.0 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 0000:04:00.1 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 2/ restart vswitchd 3/ run sudo ovs-vsctl del-port br0 dpdk1 and find the interface is NOT detached; there is no info log ‘Device '0000:04:00.1' detached’. A more verbose discussion is here: https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/333462.html along with another possible solution. Since we are nearing the end of a release, a safe approach is needed, at this time. One approach is to revert 5dcde09c80a8. This patch does not do that but reinstates the command ovs-appctl netdev-dpdk/detach to handle cases when del-port will not work. 
To detach the device, run the reinstated command ovs-appctl netdev-dpdk/detach 0000:04:00.1 Observe console output ‘Device '0000:04:00.1' has been detached’ Fixes: 5dcde09c80a8 ("netdev-dpdk: Fix device leak on port deletion.") CC: Ilya Maximets <i.maximets@samsung.com> Acked-by: Aaron Conole <aconole@redhat.com> Acked-by: Fischetti, Antonio <antonio.fischetti@intel.com> Signed-off-by: Darrell Ball <dlu998@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-01 17:04:29 -07:00
unixctl_command_register("netdev-dpdk/detach",
"pci address of device", 1, 1,
netdev_dpdk_detach, NULL);
unixctl_command_register("netdev-dpdk/get-mempool-info",
"[netdev]", 0, 1,
netdev_dpdk_get_mempool_info, NULL);
netdev-dpdk: Trigger port reconfiguration in main thread for resets. When OVS (main thread) configures a DPDK netdev, it holds a netdev_dpdk mutex lock. As part of this configure operation, the net/iavf driver (used with i40e VF devices) triggers a queue count change. The PF entity (serviced by a kernel PF driver for example) handles this change and requests back that the VF driver resets the VF device. The driver then completes the VF reset operation on its side and waits for completion of the iavf-event thread responsible for handling various VF device events. On the other hand, handling of the VF reset request in this iavf-event thread results in notifying the application with a port reset request (RTE_ETH_EVENT_INTR_RESET). The OVS reset callback tries to take a hold of the same netdev_dpdk mutex and blocks the iavf-event thread. As a result, the net/iavf driver (still running on OVS main thread) is unable to complete as it is waiting for iavf-event to complete. To break from this situation, the OVS reset callback now won't take a netdev_dpdk mutex. Instead, the port reset request is stored in a simple RTE_ETH_MAXPORTS array associated to a seq object. This is enough to let the VF driver complete this port initialization. The OVS main thread later handles the port reset request. More details in the DPDK upstream bz as this issue appeared following a change in DPDK. Link: https://bugs.dpdk.org/show_bug.cgi?id=1337 Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-01-18 17:15:00 +01:00
netdev_dpdk_reset_seq = seq_create();
netdev_dpdk_last_reset_seq = seq_read(netdev_dpdk_reset_seq);
ret = rte_eth_dev_callback_register(RTE_ETH_ALL,
RTE_ETH_EVENT_INTR_RESET,
dpdk_eth_event_callback, NULL);
if (ret != 0) {
VLOG_ERR("Ethernet device callback register error: %s",
rte_strerror(-ret));
}
ovsthread_once_done(&once);
}
return 0;
}
/*
 * One-time initialization for the vhost netdev classes: starts the
 * "ovs_vhost" thread running netdev_dpdk_vhost_events_main().
 *
 * Always returns 0.
 */
static int
netdev_dpdk_vhost_class_init(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    if (!ovsthread_once_start(&once)) {
        return 0;
    }

    ovs_thread_create("ovs_vhost", netdev_dpdk_vhost_events_main, NULL);
    ovsthread_once_done(&once);

    return 0;
}
/* QoS Functions */

/* Returns the ingress policer currently set on 'dev', read through
 * ovsrcu_get(). */
struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
{
    struct ingress_policer *policer;

    policer = ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
    return policer;
}
/*
 * Initializes QoS configuration 'conf': sets up its spinlock and binds it
 * to the operations table 'ops'.
 */
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
    rte_spinlock_init(&conf->lock);
    conf->ops = ops;
}
/*
 * Searches the NULL-terminated 'qos_confs' table for a set of QoS operations
 * whose 'qos_name' equals 'name'.
 *
 * Returns a pointer to the matching dpdk_qos_ops, or NULL if there is no
 * match or if 'name' is NULL.
 */
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
    const struct dpdk_qos_ops *const *opsp;

    /* netdev_dpdk_set_qos() passes its 'type' here before checking it for
     * NULL, and strcmp() on a null pointer is undefined behavior. */
    if (!name) {
        return NULL;
    }

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;

        if (!strcmp(name, ops->qos_name)) {
            return ops;
        }
    }

    return NULL;
}
/*
 * Adds to 'types' the name of every entry of 'qos_confs' that has a
 * 'qos_construct' operation and a non-empty 'qos_name'.
 *
 * Always returns 0.
 */
static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                          struct sset *types)
{
    const struct dpdk_qos_ops *const *cursor = qos_confs;

    while (*cursor != NULL) {
        const struct dpdk_qos_ops *ops = *cursor;

        cursor++;
        if (!ops->qos_construct || ops->qos_name[0] == '\0') {
            continue;
        }
        sset_add(types, ops->qos_name);
    }

    return 0;
}
static int
netdev_dpdk_get_qos(const struct netdev *netdev,
const char **typep, struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf) {
*typep = qos_conf->ops->qos_name;
error = (qos_conf->ops->qos_get
? qos_conf->ops->qos_get(qos_conf, details): 0);
} else {
/* No QoS configuration set, return an empty string */
*typep = "";
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
const struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const struct dpdk_qos_ops *new_ops = NULL;
struct qos_conf *qos_conf, *new_qos_conf = NULL;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
new_ops = qos_lookup_name(type);
if (!new_ops || !new_ops->qos_construct) {
new_qos_conf = NULL;
if (type && type[0]) {
error = EOPNOTSUPP;
}
} else if (qos_conf && qos_conf->ops == new_ops
&& qos_conf->ops->qos_is_equal(qos_conf, details)) {
new_qos_conf = qos_conf;
} else {
error = new_ops->qos_construct(details, &new_qos_conf);
}
if (error) {
VLOG_ERR("Failed to set QoS type %s on port %s: %s",
type, netdev->name, rte_strerror(error));
}
if (new_qos_conf != qos_conf) {
ovsrcu_set(&dev->qos_conf, new_qos_conf);
if (qos_conf) {
ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
}
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
/* Retrieves the configuration of QoS queue 'queue_id' on 'netdev' into
 * 'details'.
 *
 * Returns EOPNOTSUPP when no QoS configuration is set or the configured type
 * has no 'qos_queue_get' callback; otherwise returns the callback's result. */
static int
netdev_dpdk_get_queue(const struct netdev *netdev, uint32_t queue_id,
                      struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *conf;
    int retval = EOPNOTSUPP;

    ovs_mutex_lock(&dev->mutex);

    conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (conf && conf->ops && conf->ops->qos_queue_get) {
        retval = conf->ops->qos_queue_get(details, queue_id, conf);
    }

    ovs_mutex_unlock(&dev->mutex);

    return retval;
}
/* Creates or reconfigures QoS queue 'queue_id' on 'netdev' from 'details'.
 *
 * Returns EOPNOTSUPP when no QoS configuration is set or the configured type
 * has no 'qos_queue_construct' callback; otherwise returns the callback's
 * result.  Failures other than EOPNOTSUPP are logged. */
static int
netdev_dpdk_set_queue(struct netdev *netdev, uint32_t queue_id,
                      const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (!qos_conf || !qos_conf->ops || !qos_conf->ops->qos_queue_construct) {
        error = EOPNOTSUPP;
    } else {
        error = qos_conf->ops->qos_queue_construct(details, queue_id,
                                                   qos_conf);
    }

    if (error && error != EOPNOTSUPP) {
        /* 'queue_id' is a uint32_t: print it with the matching unsigned
         * conversion so that large ids are not shown as negative numbers. */
        VLOG_ERR("Failed to set QoS queue %"PRIu32" on port %s: %s",
                 queue_id, netdev_get_name(netdev), rte_strerror(error));
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}
/* Deletes QoS queue 'queue_id' from 'netdev'.
 *
 * Returns 0 after invoking the configured type's 'qos_queue_destruct'
 * callback, or EOPNOTSUPP when no QoS configuration is set or the type has no
 * such callback. */
static int
netdev_dpdk_delete_queue(struct netdev *netdev, uint32_t queue_id)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *conf;
    int retval = EOPNOTSUPP;

    ovs_mutex_lock(&dev->mutex);

    conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (conf && conf->ops && conf->ops->qos_queue_destruct) {
        conf->ops->qos_queue_destruct(conf, queue_id);
        retval = 0;
    }

    ovs_mutex_unlock(&dev->mutex);

    return retval;
}
static int
netdev_dpdk_get_queue_stats(const struct netdev *netdev, uint32_t queue_id,
struct netdev_queue_stats *stats)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_get_stats) {
qos_conf->ops->qos_queue_get_stats(qos_conf, queue_id, stats);
} else {
error = EOPNOTSUPP;
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
/* Begins a dump of the QoS queues configured on 'netdev'.
 *
 * When the configured QoS type provides 'qos_queue_dump_state_init', a new
 * 'struct netdev_dpdk_queue_state' is allocated, stored in '*statep' and
 * handed to that callback, whose result is returned.  Otherwise '*statep' is
 * left untouched and EOPNOTSUPP is returned. */
static int
netdev_dpdk_queue_dump_start(const struct netdev *netdev, void **statep)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *conf;
    int retval = EOPNOTSUPP;

    ovs_mutex_lock(&dev->mutex);

    conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (conf && conf->ops && conf->ops->qos_queue_dump_state_init) {
        struct netdev_dpdk_queue_state *state = xmalloc(sizeof *state);

        *statep = state;
        retval = conf->ops->qos_queue_dump_state_init(conf, state);
    }

    ovs_mutex_unlock(&dev->mutex);

    return retval;
}
/* Advances the queue dump described by 'state_' by one queue.
 *
 * Consumes queue ids from the state until one can be reported through the
 * configured type's 'qos_queue_get' callback; that id is stored in
 * '*queue_idp', its configuration in 'details', and the callback's result is
 * returned.  Returns EOF once the recorded ids are exhausted (ids are also
 * consumed, without being reported, while no 'qos_queue_get' callback is
 * available). */
static int
netdev_dpdk_queue_dump_next(const struct netdev *netdev, void *state_,
                            uint32_t *queue_idp, struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct netdev_dpdk_queue_state *state = state_;
    struct qos_conf *conf;
    int retval = EOF;

    ovs_mutex_lock(&dev->mutex);

    while (state->cur_queue < state->n_queues) {
        uint32_t queue_id = state->queues[state->cur_queue++];

        conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
        if (!conf || !conf->ops || !conf->ops->qos_queue_get) {
            continue;
        }

        *queue_idp = queue_id;
        retval = conf->ops->qos_queue_get(details, queue_id, conf);
        break;
    }

    ovs_mutex_unlock(&dev->mutex);

    return retval;
}
/* Ends a queue dump: releases the queue id array held by 'state_' and then
 * the state itself.  Always returns 0. */
static int
netdev_dpdk_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
                            void *state_)
{
    struct netdev_dpdk_queue_state *dump_state = state_;

    /* The id array must go first; it is reached through the state. */
    free(dump_state->queues);
    free(dump_state);

    return 0;
}
/* egress-policer details */
struct egress_policer {
struct qos_conf qos_conf;
struct rte_meter_srtcm_params app_srtcm_params;
struct rte_meter_srtcm egress_meter;
dpdk: Update to use DPDK 18.11. This commit adds support for DPDK v18.11, it includes the following changes. 1. Enable compilation and linkage with dpdk 18.11.0 The following dpdk commits which were introduced after dpdk 17.11.x require OVS updates to accommodate to the dpdk changes. - ce17edde ("ethdev: introduce Rx queue offloads API") - ab3ce1e0 ("ethdev: remove old offload API") - c06ddf96 ("meter: add configuration profile") - e58638c3 ("ethdev: fix TPID handling in flow API") - cd8c7c7c ("ethdev: replace bus specific struct with generic dev") - ac8d22de ("ethdev: flatten RSS configuration in flow API") 2. Limit configured rss hash functions to only those supported by the eth device. 3. Set default RSS key in struct action_rss_data, required by OVS commit- e8a2b5bf ("netdev-dpdk: implement flow offload with rte flow") when configured with "other_config:hw-offload=true". 4. DEV_RX_OFFLOAD_CRC_STRIP has been removed from DPDK 18.11. DEV_RX_OFFLOAD_KEEP_CRC can now be used to keep the CRC. Use the correct flag and check it is supported. 5. rte_eth_dev_attach/detach have been removed from DPDK 18.11. Replace them with rte_dev_probe/remove. 6. Update docs and travis to use DPDK18.11. This commit squashes the following commits present on the dpdk-latest branch: 7f021f902bb3 ("netdev-dpdk: Upgrade to dpdk v18.08") 270d9216f1ed ("netdev-dpdk: Set scatter based on capabilities") bef2cdc8f412 ("netdev-dpdk: Fix returning the field of malloced struct.") 73c1a65167fc ("redhat: change variable used for non-root user support") eb485f60ce44 ("dpdk: Update to use DPDK 18.11.") For credit all authors of the original commits above have been added as co-authors for this commmit. 
From: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Ophir Munk <ophirmu@mellanox.com> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Timothy Redaelli <tredaelli@redhat.com> Co-authored-by: Timothy Redaelli <tredaelli@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-12-10 22:15:38 +00:00
struct rte_meter_srtcm_profile egress_prof;
};
/* Fills 'params' from the "cir" and "cbs" keys of 'details'; a missing key
 * yields 0.
 *
 * The whole structure is zeroed first, which is what leaves 'ebs' at 0 and
 * what makes the result suitable for the memcmp() based comparison in
 * egress_policer_qos_is_equal(). */
static void
egress_policer_details_to_param(const struct smap *details,
                                struct rte_meter_srtcm_params *params)
{
    memset(params, 0, sizeof *params);

    params->cbs = smap_get_ullong(details, "cbs", 0);
    params->cir = smap_get_ullong(details, "cir", 0);
}
/* Creates an egress policer configured by 'details'.
 *
 * On success stores the new configuration in '*conf' and returns 0.  On
 * failure logs an error, frees the policer, sets '*conf' to NULL and returns
 * the negated DPDK status (DPDK presumably reports failures as negative errno
 * values, which makes the result a positive errno). */
static int
egress_policer_qos_construct(const struct smap *details,
                             struct qos_conf **conf)
{
    struct egress_policer *policer;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    qos_conf_init(&policer->qos_conf, &egress_policer_ops);
    egress_policer_details_to_param(details, &policer->app_srtcm_params);

    /* The profile is derived from the parameters, the meter from the
     * profile; skip the second step if the first one failed. */
    err = rte_meter_srtcm_profile_config(&policer->egress_prof,
                                         &policer->app_srtcm_params);
    if (!err) {
        err = rte_meter_srtcm_config(&policer->egress_meter,
                                     &policer->egress_prof);
    }

    if (!err) {
        *conf = &policer->qos_conf;
    } else {
        VLOG_ERR("Could not create rte meter for egress policer");
        free(policer);
        *conf = NULL;
        err = -err;
    }

    return err;
}
/* Frees the egress policer that embeds 'conf'. */
static void
egress_policer_qos_destruct(struct qos_conf *conf)
{
    struct egress_policer *owner;

    owner = CONTAINER_OF(conf, struct egress_policer, qos_conf);
    free(owner);
}
static int
egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
return 0;
}
static bool
egress_policer_qos_is_equal(const struct qos_conf *conf,
const struct smap *details)
{
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
struct rte_meter_srtcm_params params;
egress_policer_details_to_param(details, &params);
return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
}
static int
egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
bool should_steal)
{
int cnt = 0;
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
cnt = srtcm_policer_run_single_packet(&policer->egress_meter,
&policer->egress_prof, pkts,
pkt_cnt, should_steal);
return cnt;
}
/* Callbacks implementing the "egress-policer" QoS type.  Only port level
 * callbacks are set here; none of the qos_queue_* members is initialized. */
static const struct dpdk_qos_ops egress_policer_ops = {
.qos_name = "egress-policer", /* Type name reported by netdev_dpdk_get_qos(). */
.qos_construct = egress_policer_qos_construct,
.qos_destruct = egress_policer_qos_destruct,
.qos_get = egress_policer_qos_get,
.qos_is_equal = egress_policer_qos_is_equal,
.qos_run = egress_policer_run
};
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
/* trtcm-policer details: port level state of the RFC 4115 policer. */
struct trtcm_policer {
/* Generic QoS header; the policer is recovered from it with CONTAINER_OF(). */
struct qos_conf qos_conf;
/* Port level parameters parsed from "cir", "eir", "cbs" and "ebs". */
struct rte_meter_trtcm_rfc4115_params meter_params;
/* Port level meter profile, configured from 'meter_params'. */
struct rte_meter_trtcm_rfc4115_profile meter_profile;
/* Port level meter, used for the color aware check of every packet. */
struct rte_meter_trtcm_rfc4115 meter;
/* Port level counters updated by trtcm_policer_run_single_packet(). */
struct netdev_queue_stats stats;
/* Contains 'struct trtcm_policer_queue's, hashed on their queue id. */
struct hmap queues;
};
/* Per queue state of a trtcm-policer. */
struct trtcm_policer_queue {
/* Node in the owning policer's 'queues' map. */
struct hmap_node hmap_node;
/* Queue id; packets are matched to it through their 'skb_priority'. */
uint32_t queue_id;
/* Queue parameters; queue 0 takes the port values when it is configured
 * with empty details. */
struct rte_meter_trtcm_rfc4115_params meter_params;
/* Queue meter profile, configured from 'meter_params'. */
struct rte_meter_trtcm_rfc4115_profile meter_profile;
/* Queue meter, used for the color blind check. */
struct rte_meter_trtcm_rfc4115 meter;
/* Queue counters updated by trtcm_policer_run_single_packet(). */
struct netdev_queue_stats stats;
};
/* Fills 'params' from the "cir", "cbs", "eir" and "ebs" keys of 'details'; a
 * missing key yields 0.
 *
 * The structure is zeroed first so that the result can be compared with
 * memcmp(), as trtcm_policer_qos_is_equal() does. */
static void
trtcm_policer_details_to_param(const struct smap *details,
                               struct rte_meter_trtcm_rfc4115_params *params)
{
    memset(params, 0, sizeof *params);

    /* Committed rate and burst. */
    params->cir = smap_get_ullong(details, "cir", 0);
    params->cbs = smap_get_ullong(details, "cbs", 0);

    /* Excess rate and burst. */
    params->eir = smap_get_ullong(details, "eir", 0);
    params->ebs = smap_get_ullong(details, "ebs", 0);
}
/* Adds the four values of 'params' to 'details' under the keys "cir", "eir",
 * "cbs" and "ebs", formatted as unsigned decimal numbers.  This is the
 * inverse of trtcm_policer_details_to_param(). */
static void
trtcm_policer_param_to_detail(
    const struct rte_meter_trtcm_rfc4115_params *params,
    struct smap *details)
{
    /* Rates. */
    smap_add_format(details, "cir", "%"PRIu64, params->cir);
    smap_add_format(details, "eir", "%"PRIu64, params->eir);

    /* Burst sizes. */
    smap_add_format(details, "cbs", "%"PRIu64, params->cbs);
    smap_add_format(details, "ebs", "%"PRIu64, params->ebs);
}
/* Creates a trtcm-policer configured by 'details'.
 *
 * On success stores the new configuration in '*conf', with zeroed port
 * statistics and an empty queue map, and returns 0.  On failure frees the
 * policer, sets '*conf' to NULL and returns the negated DPDK status. */
static int
trtcm_policer_qos_construct(const struct smap *details,
                            struct qos_conf **conf)
{
    struct trtcm_policer *policer = xmalloc(sizeof *policer);
    int rc;

    qos_conf_init(&policer->qos_conf, &trtcm_policer_ops);
    trtcm_policer_details_to_param(details, &policer->meter_params);

    /* Profile first, then the meter that runs on it. */
    rc = rte_meter_trtcm_rfc4115_profile_config(&policer->meter_profile,
                                                &policer->meter_params);
    if (!rc) {
        rc = rte_meter_trtcm_rfc4115_config(&policer->meter,
                                            &policer->meter_profile);
    }

    if (rc) {
        free(policer);
        *conf = NULL;
        return -rc;
    }

    *conf = &policer->qos_conf;
    memset(&policer->stats, 0, sizeof policer->stats);
    hmap_init(&policer->queues);

    return 0;
}
static void
trtcm_policer_qos_destruct(struct qos_conf *conf)
{
struct trtcm_policer_queue *queue;
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
HMAP_FOR_EACH_SAFE (queue, hmap_node, &policer->queues) {
netdev-dpdk: Add new DPDK RFC 4115 egress policer This patch adds a new policer to the DPDK datapath based on RFC 4115's Two-Rate, Three-Color marker. It's a two-level hierarchical policer which first does a color-blind marking of the traffic at the queue level, followed by a color-aware marking at the port level. At the end traffic marked as Green or Yellow is forwarded, Red is dropped. For details on how traffic is marked, see RFC 4115. This egress policer can be used to limit traffic at different rated based on the queues the traffic is in. In addition, it can also be used to prioritize certain traffic over others at a port level. For example, the following configuration will limit the traffic rate at a port level to a maximum of 2000 packets a second (64 bytes IPv4 packets). 100pps as CIR (Committed Information Rate) and 1000pps as EIR (Excess Information Rate). High priority traffic is routed to queue 10, which marks all traffic as CIR, i.e. Green. All low priority traffic, queue 20, is marked as EIR, i.e. Yellow. ovs-vsctl --timeout=5 set port dpdk1 qos=@myqos -- \ --id=@myqos create qos type=trtcm-policer \ other-config:cir=52000 other-config:cbs=2048 \ other-config:eir=52000 other-config:ebs=2048 \ queues:10=@dpdk1Q10 queues:20=@dpdk1Q20 -- \ --id=@dpdk1Q10 create queue \ other-config:cir=41600000 other-config:cbs=2048 \ other-config:eir=0 other-config:ebs=0 -- \ --id=@dpdk1Q20 create queue \ other-config:cir=0 other-config:cbs=0 \ other-config:eir=41600000 other-config:ebs=2048 \ This configuration accomplishes that the high priority traffic has a guaranteed bandwidth egressing the ports at CIR (1000pps), but it can also use the EIR, so a total of 2000pps at max. These additional 1000pps is shared with the low priority traffic. The low priority traffic can use at maximum 1000pps. Signed-off-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2020-01-14 11:12:47 -05:00
hmap_remove(&policer->queues, &queue->hmap_node);
free(queue);
}
hmap_destroy(&policer->queues);
free(policer);
}
/* Adds the port level parameters of the trtcm-policer that embeds 'conf' to
 * 'details'.  Always returns 0. */
static int
trtcm_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
    struct trtcm_policer *owner;

    owner = CONTAINER_OF(conf, struct trtcm_policer, qos_conf);
    trtcm_policer_param_to_detail(&owner->meter_params, details);

    return 0;
}
static bool
trtcm_policer_qos_is_equal(const struct qos_conf *conf,
const struct smap *details)
{
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
struct rte_meter_trtcm_rfc4115_params params;
trtcm_policer_details_to_param(details, &params);
return !memcmp(&params, &policer->meter_params, sizeof params);
}
/* Looks up the queue with id 'queue_id' in 'policer'.
 *
 * Returns the queue, or NULL if 'policer' has no queue with that id. */
static struct trtcm_policer_queue *
trtcm_policer_qos_find_queue(struct trtcm_policer *policer, uint32_t queue_id)
{
    struct trtcm_policer_queue *candidate;

    /* Different ids may share a hash value, so the id itself is compared. */
    HMAP_FOR_EACH_WITH_HASH (candidate, hmap_node, hash_2words(queue_id, 0),
                             &policer->queues) {
        if (candidate->queue_id == queue_id) {
            return candidate;
        }
    }

    return NULL;
}
static inline bool
trtcm_policer_run_single_packet(struct trtcm_policer *policer,
struct rte_mbuf *pkt, uint64_t time)
{
enum rte_color pkt_color;
struct trtcm_policer_queue *queue;
uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);
struct dp_packet *dpkt = CONTAINER_OF(pkt, struct dp_packet, mbuf);
queue = trtcm_policer_qos_find_queue(policer, dpkt->md.skb_priority);
if (!queue) {
/* If no queue is found, use the default queue, which MUST exist. */
queue = trtcm_policer_qos_find_queue(policer, 0);
if (!queue) {
return false;
}
}
pkt_color = rte_meter_trtcm_rfc4115_color_blind_check(&queue->meter,
&queue->meter_profile,
time,
pkt_len);
if (pkt_color == RTE_COLOR_RED) {
queue->stats.tx_errors++;
} else {
queue->stats.tx_bytes += pkt_len;
queue->stats.tx_packets++;
}
pkt_color = rte_meter_trtcm_rfc4115_color_aware_check(&policer->meter,
&policer->meter_profile,
time, pkt_len,
pkt_color);
if (pkt_color == RTE_COLOR_RED) {
policer->stats.tx_errors++;
return false;
}
policer->stats.tx_bytes += pkt_len;
policer->stats.tx_packets++;
return true;
}
/* Runs the 'pkt_cnt' packets in 'pkts' through the trtcm-policer that embeds
 * 'conf', using a single timestamp for the whole batch.
 *
 * Accepted packets are compacted to the front of 'pkts', keeping their order,
 * and their number is returned.  Rejected packets are freed only when
 * 'should_steal' is true. */
static int
trtcm_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
                  bool should_steal)
{
    uint64_t now = rte_rdtsc();
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);
    int kept = 0;
    int idx;

    for (idx = 0; idx < pkt_cnt; idx++) {
        struct rte_mbuf *mbuf = pkts[idx];

        if (!trtcm_policer_run_single_packet(policer, mbuf, now)) {
            if (should_steal) {
                rte_pktmbuf_free(mbuf);
            }
            continue;
        }

        /* Close the gap left by packets that were rejected earlier. */
        if (kept != idx) {
            pkts[kept] = mbuf;
        }
        kept++;
    }

    return kept;
}
/* Creates queue 'queue_id' in the trtcm-policer that embeds 'conf', or
 * reconfigures it if it already exists, from 'details'.
 *
 * Queue 0 configured with empty 'details' takes the port level parameters.
 * A newly created queue starts with zeroed statistics and its creation time
 * recorded.
 *
 * Returns 0 on success.  When DPDK rejects the parameters, the queue -- even
 * one that existed before the call -- is removed from the policer and freed,
 * and the negated DPDK status is returned. */
static int
trtcm_policer_qos_queue_construct(const struct smap *details,
                                  uint32_t queue_id, struct qos_conf *conf)
{
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);
    struct trtcm_policer_queue *queue;
    int rc;

    queue = trtcm_policer_qos_find_queue(policer, queue_id);
    if (!queue) {
        queue = xmalloc(sizeof *queue);
        queue->queue_id = queue_id;
        memset(&queue->stats, 0, sizeof queue->stats);
        queue->stats.created = time_msec();
        hmap_insert(&policer->queues, &queue->hmap_node,
                    hash_2words(queue_id, 0));
    }

    if (queue_id != 0 || !smap_is_empty(details)) {
        trtcm_policer_details_to_param(details, &queue->meter_params);
    } else {
        /* No default queue configured, use port values */
        memcpy(&queue->meter_params, &policer->meter_params,
               sizeof queue->meter_params);
    }

    rc = rte_meter_trtcm_rfc4115_profile_config(&queue->meter_profile,
                                                &queue->meter_params);
    if (!rc) {
        rc = rte_meter_trtcm_rfc4115_config(&queue->meter,
                                            &queue->meter_profile);
    }
    if (!rc) {
        return 0;
    }

    hmap_remove(&policer->queues, &queue->hmap_node);
    free(queue);

    return -rc;
}
/* Removes queue 'queue_id' from the trtcm-policer that embeds 'conf' and
 * frees it.  Does nothing if there is no such queue. */
static void
trtcm_policer_qos_queue_destruct(struct qos_conf *conf, uint32_t queue_id)
{
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);
    struct trtcm_policer_queue *victim;

    victim = trtcm_policer_qos_find_queue(policer, queue_id);
    if (!victim) {
        return;
    }

    hmap_remove(&policer->queues, &victim->hmap_node);
    free(victim);
}
/* Adds the parameters of queue 'queue_id' of the trtcm-policer that embeds
 * 'conf' to 'details'.
 *
 * Returns 0 on success, EINVAL if there is no such queue. */
static int
trtcm_policer_qos_queue_get(struct smap *details, uint32_t queue_id,
                            const struct qos_conf *conf)
{
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);
    struct trtcm_policer_queue *queue;

    queue = trtcm_policer_qos_find_queue(policer, queue_id);
    if (queue) {
        trtcm_policer_param_to_detail(&queue->meter_params, details);
        return 0;
    }

    return EINVAL;
}
/* Copies the statistics of queue 'queue_id' of the trtcm-policer that embeds
 * 'conf' into 'stats'.
 *
 * Returns 0 on success.  Returns EINVAL, leaving 'stats' untouched, if there
 * is no such queue. */
static int
trtcm_policer_qos_queue_get_stats(const struct qos_conf *conf,
                                  uint32_t queue_id,
                                  struct netdev_queue_stats *stats)
{
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);
    struct trtcm_policer_queue *queue;

    queue = trtcm_policer_qos_find_queue(policer, queue_id);
    if (queue) {
        memcpy(stats, &queue->stats, sizeof *stats);
        return 0;
    }

    return EINVAL;
}
/* Initializes 'state' for a dump of the queues of the trtcm-policer that
 * embeds 'conf': records the number of queues, resets the cursor and stores
 * the id of every queue in a newly allocated array.
 *
 * Always returns 0. */
static int
trtcm_policer_qos_queue_dump_state_init(const struct qos_conf *conf,
                                        struct netdev_dpdk_queue_state *state)
{
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);
    struct trtcm_policer_queue *queue;
    uint32_t next = 0;

    state->cur_queue = 0;
    state->n_queues = hmap_count(&policer->queues);
    state->queues = xmalloc(state->n_queues * sizeof *state->queues);

    /* Snapshot the ids now; the dump later walks this array, not the map. */
    HMAP_FOR_EACH (queue, hmap_node, &policer->queues) {
        state->queues[next++] = queue->queue_id;
    }

    return 0;
}
/* Callbacks implementing the "trtcm-policer" QoS type.  In addition to the
 * port level callbacks it provides the per queue ones (construct, destruct,
 * get, statistics and dump state initialization). */
static const struct dpdk_qos_ops trtcm_policer_ops = {
.qos_name = "trtcm-policer",
/* Port level callbacks. */
.qos_construct = trtcm_policer_qos_construct,
.qos_destruct = trtcm_policer_qos_destruct,
.qos_get = trtcm_policer_qos_get,
.qos_is_equal = trtcm_policer_qos_is_equal,
.qos_run = trtcm_policer_run,
/* Queue level callbacks. */
.qos_queue_construct = trtcm_policer_qos_queue_construct,
.qos_queue_destruct = trtcm_policer_qos_queue_destruct,
.qos_queue_get = trtcm_policer_qos_queue_get,
.qos_queue_get_stats = trtcm_policer_qos_queue_get_stats,
.qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init
};
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
static int
dpdk_rx_steer_add_flow(struct netdev_dpdk *dev,
const struct rte_flow_item items[],
const char *desc)
{
const struct rte_flow_attr attr = { .ingress = 1 };
const struct rte_flow_action actions[] = {
{
.type = RTE_FLOW_ACTION_TYPE_QUEUE,
.conf = &(const struct rte_flow_action_queue) {
.index = dev->up.n_rxq - 1,
},
},
{ .type = RTE_FLOW_ACTION_TYPE_END },
};
struct rte_flow_error error;
struct rte_flow *flow;
size_t num;
int err;
set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
err = rte_flow_validate(dev->port_id, &attr, items, actions, &error);
if (err) {
VLOG_WARN("%s: rx-steering: device does not support %s flow: %s",
netdev_get_name(&dev->up), desc,
error.message ? error.message : "");
goto out;
}
set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
flow = rte_flow_create(dev->port_id, &attr, items, actions, &error);
if (flow == NULL) {
VLOG_WARN("%s: rx-steering: failed to add %s flow: %s",
netdev_get_name(&dev->up), desc,
error.message ? error.message : "");
err = rte_errno;
goto out;
}
num = dev->rx_steer_flows_num + 1;
dev->rx_steer_flows = xrealloc(dev->rx_steer_flows, num * sizeof flow);
dev->rx_steer_flows[dev->rx_steer_flows_num] = flow;
dev->rx_steer_flows_num = num;
VLOG_INFO("%s: rx-steering: redirected %s traffic to rx queue %d",
netdev_get_name(&dev->up), desc, dev->up.n_rxq - 1);
out:
return err;
}
/* Number of 64-entry groups needed to describe the largest RSS redirection
 * table that dpdk_rx_steer_rss_configure() can program (512 entries). */
#define RETA_CONF_SIZE (RTE_ETH_RSS_RETA_SIZE_512 / RTE_ETH_RETA_GROUP_SIZE)

/* Reprograms the RSS redirection table (reta) of 'dev' so that its entries
 * cycle over rx queues 0 to 'rss_n_rxq - 1' only.
 *
 * Returns 0 on success, a negative value on failure: the error reported by
 * DPDK, -EINVAL if 'rss_n_rxq' is not positive, or -ENOTSUP if the device
 * reports a redirection table larger than RTE_ETH_RSS_RETA_SIZE_512. */
static int
dpdk_rx_steer_rss_configure(struct netdev_dpdk *dev, int rss_n_rxq)
{
    struct rte_eth_rss_reta_entry64 reta_conf[RETA_CONF_SIZE];
    struct rte_eth_dev_info info;
    int err;

    if (rss_n_rxq <= 0) {
        /* 'rss_n_rxq' is used as a modulo divisor below. */
        return -EINVAL;
    }

    err = rte_eth_dev_info_get(dev->port_id, &info);
    if (err < 0) {
        VLOG_WARN("%s: failed to get device info: err=%d",
                  netdev_get_name(&dev->up), err);
        return err;
    }

    if (info.reta_size % rss_n_rxq != 0 &&
        info.reta_size < RTE_ETH_RSS_RETA_SIZE_128) {
        /*
         * Some drivers set reta_size equal to the total number of rxqs that
         * are configured when it is a power of two. Since we are actually
         * reconfiguring the redirection table to exclude the last rxq, we may
         * end up with an imbalanced redirection table. For example, such
         * configuration:
         *
         *   options:n_rxq=3 options:rx-steering=rss+lacp
         *
         * Will actually configure 4 rxqs on the NIC, and the default reta to:
         *
         *   [0, 1, 2, 3]
         *
         * And dpdk_rx_steer_rss_configure() will reconfigure reta to:
         *
         *   [0, 1, 2, 0]
         *
         * Causing queue 0 to receive twice as much traffic as queues 1 and 2.
         *
         * Work around that corner case by forcing a bigger redirection table
         * size to 128 entries when reta_size is not a multiple of rss_n_rxq
         * and when reta_size is less than 128. This value seems to be
         * supported by most of the drivers that also support rte_flow.
         */
        info.reta_size = RTE_ETH_RSS_RETA_SIZE_128;
    }

    if (info.reta_size > RTE_ETH_RSS_RETA_SIZE_512) {
        /* 'reta_conf' only has room for 512 entries.  Refuse bigger tables
         * instead of writing past the end of the array. */
        VLOG_WARN("%s: RSS redirection table size %u is not supported",
                  netdev_get_name(&dev->up), (unsigned int) info.reta_size);
        return -ENOTSUP;
    }

    memset(reta_conf, 0, sizeof reta_conf);
    for (uint16_t i = 0; i < info.reta_size; i++) {
        uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
        uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;

        /* Mark the entry as valid and spread entries round-robin over the
         * first 'rss_n_rxq' queues. */
        reta_conf[idx].mask |= 1ULL << shift;
        reta_conf[idx].reta[shift] = i % rss_n_rxq;
    }

    err = rte_eth_dev_rss_reta_update(dev->port_id, reta_conf, info.reta_size);
    if (err < 0) {
        VLOG_WARN("%s: failed to configure RSS redirection table: err=%d",
                  netdev_get_name(&dev->up), err);
    }

    return err;
}
/* Applies the requested rx-steering configuration to 'dev'.
 *
 * Needs at least two rx queues: the last one receives the steered traffic.
 * If DPDK_RX_STEER_LACP is set in 'dev->requested_rx_steer_flags', a flow
 * matching the LACP ethertype is added with dpdk_rx_steer_add_flow().  If at
 * least one flow is installed, the RSS redirection table is reprogrammed
 * over the first 'n_rxq - 1' queues with dpdk_rx_steer_rss_configure().
 *
 * Returns 0 on success.  On failure returns a nonzero value: ENOTSUP when
 * there are fewer than two rx queues, otherwise whatever the failing helper
 * returned. */
static int
dpdk_rx_steer_configure(struct netdev_dpdk *dev)
{
    int err = 0;

    if (dev->up.n_rxq < 2) {
        err = ENOTSUP;
        VLOG_WARN("%s: rx-steering: not enough available rx queues",
                  netdev_get_name(&dev->up));
        goto out;
    }

    if (dev->requested_rx_steer_flags & DPDK_RX_STEER_LACP) {
        /* Match on the exact LACP ethertype, ignoring all other fields. */
        const struct rte_flow_item items[] = {
            {
                .type = RTE_FLOW_ITEM_TYPE_ETH,
                .spec = &(const struct rte_flow_item_eth){
                    .type = htons(ETH_TYPE_LACP),
                },
                .mask = &(const struct rte_flow_item_eth){
                    .type = htons(0xffff),
                },
            },
            { .type = RTE_FLOW_ITEM_TYPE_END },
        };

        err = dpdk_rx_steer_add_flow(dev, items, "lacp");
        if (err) {
            goto out;
        }
    }

    if (dev->rx_steer_flows_num) {
        /* Reconfigure RSS reta in all but the rx steering queue. */
        err = dpdk_rx_steer_rss_configure(dev, dev->up.n_rxq - 1);
        if (err) {
            goto out;
        }
        if (dev->up.n_rxq == 2) {
            VLOG_INFO("%s: rx-steering: redirected other traffic to "
                      "rx queue 0", netdev_get_name(&dev->up));
        } else {
            /* Use the same conversion as the other rx queue index logs. */
            VLOG_INFO("%s: rx-steering: applied rss on rx queues 0-%d",
                      netdev_get_name(&dev->up), dev->up.n_rxq - 2);
        }
    }

out:
    return err;
}
static void
dpdk_rx_steer_unconfigure(struct netdev_dpdk *dev)
{
struct rte_flow_error error;
if (!dev->rx_steer_flows_num) {
return;
}
VLOG_DBG("%s: rx-steering: reset flows", netdev_get_name(&dev->up));
for (int i = 0; i < dev->rx_steer_flows_num; i++) {
set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
if (rte_flow_destroy(dev->port_id, dev->rx_steer_flows[i], &error)) {
VLOG_WARN("%s: rx-steering: failed to destroy flow: %s",
netdev_get_name(&dev->up),
error.message ? error.message : "");
}
}
free(dev->rx_steer_flows);
dev->rx_steer_flows_num = 0;
dev->rx_steer_flows = NULL;
/*
* Most DPDK drivers seem to reset their RSS redirection table in
* rte_eth_dev_configure() or rte_eth_dev_start(), both of which are
* called in dpdk_eth_dev_init(). No need to explicitly reset it.
*/
}
static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
bool try_rx_steer;
int err = 0;
ovs_mutex_lock(&dev->mutex);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
try_rx_steer = dev->requested_rx_steer_flags != 0;
dev->requested_n_rxq = dev->user_n_rxq;
if (try_rx_steer) {
dev->requested_n_rxq += 1;
}
if (netdev->n_txq == dev->requested_n_txq
&& netdev->n_rxq == dev->requested_n_rxq
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
&& dev->rx_steer_flags == dev->requested_rx_steer_flags
&& dev->mtu == dev->requested_mtu
&& dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
&& dev->rxq_size == dev->requested_rxq_size
&& dev->txq_size == dev->requested_txq_size
&& eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)
&& dev->socket_id == dev->requested_socket_id
&& dev->started && !dev->reset_needed) {
/* Reconfiguration is unnecessary */
goto out;
}
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
retry:
dpdk_rx_steer_unconfigure(dev);
if (dev->reset_needed) {
rte_eth_dev_reset(dev->port_id);
if_notifier_manual_report();
dev->reset_needed = false;
} else {
rte_eth_dev_stop(dev->port_id);
}
dev->started = false;
err = netdev_dpdk_mempool_configure(dev);
if (err && err != EEXIST) {
goto out;
}
dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;
netdev->n_txq = dev->requested_n_txq;
netdev->n_rxq = dev->requested_n_rxq;
dev->rxq_size = dev->requested_rxq_size;
dev->txq_size = dev->requested_txq_size;
rte_free(dev->tx_q);
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
dev->tx_q = NULL;
if (!eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)) {
err = netdev_dpdk_set_etheraddr__(dev, dev->requested_hwaddr);
if (err) {
goto out;
}
}
err = dpdk_eth_dev_init(dev);
netdev_dpdk_update_netdev_flags(dev);
/* If both requested and actual hwaddr were previously
* unset (initialized to 0), then first device init above
* will have set actual hwaddr to something new.
* This would trigger spurious MAC reconfiguration unless
* the requested MAC is kept in sync.
*
* This is harmless in case requested_hwaddr was
* configured by the user, as netdev_dpdk_set_etheraddr__()
* will have succeeded to get to this point.
*/
dev->requested_hwaddr = dev->hwaddr;
netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. 
Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. +----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. 
When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Aaron Conole <aconole@redhat.com> Signed-off-by: Robin Jarry <rjarry@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-07-04 21:59:56 +02:00
if (try_rx_steer) {
err = dpdk_rx_steer_configure(dev);
if (err) {
/* No hw support, disable & recover gracefully. */
try_rx_steer = false;
/*
* The extra queue must be explicitly removed here to ensure that
* it is unconfigured immediately.
*/
dev->requested_n_rxq = dev->user_n_rxq;
goto retry;
}
} else {
VLOG_INFO("%s: rx-steering: default rss", netdev_get_name(&dev->up));
}
dev->rx_steer_flags = dev->requested_rx_steer_flags;
dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
if (!dev->tx_q) {
err = ENOMEM;
}
netdev_change_seq_changed(netdev);
out:
ovs_mutex_unlock(&dev->mutex);
return err;
}
static int
dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
dev->up.n_txq = dev->requested_n_txq;
dev->up.n_rxq = dev->requested_n_rxq;
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
/* Always keep RX queue 0 enabled for implementations that won't
* report vring states. */
dev->vhost_rxq_enabled[0] = true;
/* Enable TX queue 0 by default if it wasn't disabled. */
if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
dev->tx_q[0].map = 0;
}
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
rte_spinlock_lock(&dev->stats_lock);
memset(&dev->stats, 0, sizeof dev->stats);
memset(dev->sw_stats, 0, sizeof *dev->sw_stats);
rte_spinlock_unlock(&dev->stats_lock);
netdev_dpdk_remap_txqs(dev);
if (netdev_dpdk_get_vid(dev) >= 0) {
int err;
err = netdev_dpdk_mempool_configure(dev);
if (!err) {
/* A new mempool was created or re-used. */
netdev_change_seq_changed(&dev->up);
} else if (err != EEXIST) {
return err;
}
if (dev->vhost_reconfigured == false) {
dev->vhost_reconfigured = true;
/* Carrier status may need updating. */
netdev_change_seq_changed(&dev->up);
}
}
netdev_dpdk_update_netdev_flags(dev);
return 0;
}
/* Reconfigures a vhost netdev by running dpdk_vhost_reconfigure_helper()
 * while holding the device mutex.  Returns the helper's result. */
static int
netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int rc;

    ovs_mutex_lock(&dev->mutex);
    rc = dpdk_vhost_reconfigure_helper(dev);
    ovs_mutex_unlock(&dev->mutex);

    return rc;
}
static int
netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bool unregister = false;
char *vhost_id;
int err;
ovs_mutex_lock(&dev->mutex);
if (dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT && dev->vhost_id
&& dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING) {
/* This vhost-user port was registered to the vhost library already,
* but a socket disconnection happened and configuration must be
* re-evaluated wrt dev->virtio_features_state. */
dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
vhost_id = dev->vhost_id;
unregister = true;
}
ovs_mutex_unlock(&dev->mutex);
if (unregister) {
dpdk_vhost_driver_unregister(dev, vhost_id);
}
ovs_mutex_lock(&dev->mutex);
/* Configure vHost client mode if requested and if the following criteria
* are met:
* 1. Device hasn't been registered yet.
* 2. A path has been specified.
*/
if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
uint64_t virtio_unsup_features = 0;
uint64_t vhost_flags = 0;
bool enable_tso;
enable_tso = userspace_tso_enabled()
&& dev->virtio_features_state & OVS_VIRTIO_F_CLEAN;
dev->virtio_features_state &= ~OVS_VIRTIO_F_RECONF_PENDING;
/* Register client-mode device. */
vhost_flags |= RTE_VHOST_USER_CLIENT;
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
/* Extended per vq statistics. */
vhost_flags |= RTE_VHOST_USER_NET_STATS_ENABLE;
/* There is no support for multi-segments buffers. */
vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
/* Enable IOMMU support, if explicitly requested. */
if (vhost_iommu_enabled) {
vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
}
/* Enable POSTCOPY support, if explicitly requested. */
if (vhost_postcopy_enabled) {
vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
}
/* Enable External Buffers if TCP Segmentation Offload is enabled. */
if (enable_tso) {
vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
}
err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
if (err) {
VLOG_ERR("vhost-user device setup failure for device %s\n",
dev->vhost_id);
goto unlock;
} else {
/* Configuration successful */
dev->vhost_driver_flags |= vhost_flags;
VLOG_INFO("vHost User device '%s' created in 'client' mode, "
"using client socket '%s'",
dev->up.name, dev->vhost_id);
}
err = rte_vhost_driver_callback_register(dev->vhost_id,
&virtio_net_device_ops);
if (err) {
VLOG_ERR("rte_vhost_driver_callback_register failed for "
"vhost user client port: %s\n", dev->up.name);
goto unlock;
}
if (enable_tso) {
virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_ECN
| 1ULL << VIRTIO_NET_F_HOST_UFO;
VLOG_DBG("%s: TSO enabled on vhost port",
netdev_get_name(&dev->up));
} else {
/* Advertise checksum offloading to the guest, but explicitly
* disable TSO and friends.
* NOTE: we can't disable HOST_ECN which may have been wrongly
* negotiated by a running guest. */
virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_TSO4
| 1ULL << VIRTIO_NET_F_HOST_TSO6
| 1ULL << VIRTIO_NET_F_HOST_UFO;
}
err = rte_vhost_driver_disable_features(dev->vhost_id,
virtio_unsup_features);
if (err) {
VLOG_ERR("rte_vhost_driver_disable_features failed for "
"vhost user client port: %s\n", dev->up.name);
goto unlock;
}
err = rte_vhost_driver_start(dev->vhost_id);
if (err) {
VLOG_ERR("rte_vhost_driver_start failed for vhost user "
"client port: %s\n", dev->up.name);
goto unlock;
}
}
err = dpdk_vhost_reconfigure_helper(dev);
unlock:
ovs_mutex_unlock(&dev->mutex);
return err;
}
/* Returns the DPDK port id of 'netdev', read under the device mutex, or -1
 * if 'netdev' is not of a DPDK class. */
int
netdev_dpdk_get_port_id(struct netdev *netdev)
{
    struct netdev_dpdk *dev;
    int port_id;

    if (!is_dpdk_class(netdev->netdev_class)) {
        return -1;
    }

    dev = netdev_dpdk_cast(netdev);
    ovs_mutex_lock(&dev->mutex);
    port_id = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

    return port_id;
}
/* Reports whether the rte_flow offload API may be used with 'netdev'.
 *
 * Returns true for "vxlan" and "gre" netdevs whose dpif type is "netdev",
 * and for DPDK ethernet devices.  Returns false for everything else.
 *
 * Side effect: on a DPDK ethernet device with rx-steering requested, the
 * request is cleared and a reconfiguration of the netdev is requested. */
bool
netdev_dpdk_flow_api_supported(struct netdev *netdev)
{
    const char *type = netdev_get_type(netdev);
    struct netdev_dpdk *dev;
    bool supported = false;

    if ((!strcmp(type, "vxlan") || !strcmp(type, "gre"))
        && !strcmp(netdev_get_dpif_type(netdev), "netdev")) {
        return true;
    }

    if (!is_dpdk_class(netdev->netdev_class)) {
        return false;
    }

    dev = netdev_dpdk_cast(netdev);
    ovs_mutex_lock(&dev->mutex);
    if (dev->type == DPDK_DEV_ETH) {
        if (dev->requested_rx_steer_flags) {
            VLOG_WARN("%s: rx-steering is mutually exclusive with hw-offload,"
                      " falling back to default rss mode",
                      netdev_get_name(netdev));
            dev->requested_rx_steer_flags = 0;
            netdev_request_reconfigure(netdev);
        }
        /* TODO: Check if we able to offload some minimal flow. */
        supported = true;
    }
    ovs_mutex_unlock(&dev->mutex);

    return supported;
}
int
netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
struct rte_flow *rte_flow,
struct rte_flow_error *error)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int ret;
ret = rte_flow_destroy(dev->port_id, rte_flow, error);
return ret;
}
struct rte_flow *
netdev_dpdk_rte_flow_create(struct netdev *netdev,
const struct rte_flow_attr *attr,
const struct rte_flow_item *items,
const struct rte_flow_action *actions,
struct rte_flow_error *error)
{
struct rte_flow *flow;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
flow = rte_flow_create(dev->port_id, attr, items, actions, error);
return flow;
}
int
netdev_dpdk_rte_flow_query_count(struct netdev *netdev,
struct rte_flow *rte_flow,
struct rte_flow_query_count *query,
struct rte_flow_error *error)
{
struct rte_flow_action_count count = { .id = 0, };
const struct rte_flow_action actions[] = {
{
.type = RTE_FLOW_ACTION_TYPE_COUNT,
.conf = &count,
},
{
.type = RTE_FLOW_ACTION_TYPE_END,
},
};
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
return ret;
}
#ifdef ALLOW_EXPERIMENTAL_API
/* Calls rte_flow_tunnel_decap_set() for 'tunnel' on the DPDK port backing
 * 'netdev', with the device mutex held.  'actions' and 'num_of_actions'
 * are passed through to that call.
 *
 * Returns -1 if 'netdev' is not of a DPDK class, otherwise the value of
 * the DPDK call. */
int
netdev_dpdk_rte_flow_tunnel_decap_set(struct netdev *netdev,
                                      struct rte_flow_tunnel *tunnel,
                                      struct rte_flow_action **actions,
                                      uint32_t *num_of_actions,
                                      struct rte_flow_error *error)
{
    struct netdev_dpdk *dev;
    int rc;

    if (!is_dpdk_class(netdev->netdev_class)) {
        return -1;
    }

    dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    rc = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
                                   num_of_actions, error);
    ovs_mutex_unlock(&dev->mutex);

    return rc;
}
/* Calls rte_flow_tunnel_match() for 'tunnel' on the DPDK port backing
 * 'netdev', with the device mutex held.  'items' and 'num_of_items' are
 * passed through to that call.
 *
 * Returns -1 if 'netdev' is not of a DPDK class, otherwise the value of
 * the DPDK call. */
int
netdev_dpdk_rte_flow_tunnel_match(struct netdev *netdev,
                                  struct rte_flow_tunnel *tunnel,
                                  struct rte_flow_item **items,
                                  uint32_t *num_of_items,
                                  struct rte_flow_error *error)
{
    struct netdev_dpdk *dev;
    int rc;

    if (!is_dpdk_class(netdev->netdev_class)) {
        return -1;
    }

    dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    rc = rte_flow_tunnel_match(dev->port_id, tunnel, items, num_of_items,
                               error);
    ovs_mutex_unlock(&dev->mutex);

    return rc;
}
int
netdev_dpdk_rte_flow_get_restore_info(struct netdev *netdev,
struct dp_packet *p,
struct rte_flow_restore_info *info,
struct rte_flow_error *error)
{
struct rte_mbuf *m = (struct rte_mbuf *) p;
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
ovs_mutex_unlock(&dev->mutex);
return ret;
}
/* Calls rte_flow_tunnel_action_decap_release() for the 'num_of_actions'
 * entries in 'actions' on the DPDK port backing 'netdev', with the device
 * mutex held.
 *
 * Returns -1 if 'netdev' is not of a DPDK class, otherwise the value of
 * the DPDK call. */
int
netdev_dpdk_rte_flow_tunnel_action_decap_release(
    struct netdev *netdev,
    struct rte_flow_action *actions,
    uint32_t num_of_actions,
    struct rte_flow_error *error)
{
    struct netdev_dpdk *dev;
    int rc;

    if (!is_dpdk_class(netdev->netdev_class)) {
        return -1;
    }

    dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    rc = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
                                              num_of_actions, error);
    ovs_mutex_unlock(&dev->mutex);

    return rc;
}
/* Calls rte_flow_tunnel_item_release() for the 'num_of_items' entries in
 * 'items' on the DPDK port backing 'netdev', with the device mutex held.
 *
 * Returns -1 if 'netdev' is not of a DPDK class, otherwise the value of
 * the DPDK call. */
int
netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev,
                                         struct rte_flow_item *items,
                                         uint32_t num_of_items,
                                         struct rte_flow_error *error)
{
    struct netdev_dpdk *dev;
    int rc;

    if (!is_dpdk_class(netdev->netdev_class)) {
        return -1;
    }

    dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    rc = rte_flow_tunnel_item_release(dev->port_id, items, num_of_items,
                                      error);
    ovs_mutex_unlock(&dev->mutex);

    return rc;
}
#endif /* ALLOW_EXPERIMENTAL_API */
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has its MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
/* Reads the boolean "per-port-memory" key from 'ovs_other_config' (false
 * when absent), stores it in 'per_port_memory' and logs the setting. */
static void
parse_mempool_config(const struct smap *ovs_other_config)
{
    const char *state;

    per_port_memory = smap_get_bool(ovs_other_config,
                                    "per-port-memory", false);
    state = per_port_memory ? "enabled" : "disabled";

    VLOG_INFO("Per port memory for DPDK devices %s.", state);
}
static void
parse_user_mempools_list(const struct smap *ovs_other_config)
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has it's MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
{
const char *mtus = smap_get(ovs_other_config, "shared-mempool-config");
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has it's MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
char *list, *copy, *key, *value;
int error = 0;
if (!mtus) {
return;
}
n_user_mempools = 0;
list = copy = xstrdup(mtus);
while (ofputil_parse_key_value(&list, &key, &value)) {
int socket_id, mtu, adj_mtu;
if (!str_to_int(key, 0, &mtu) || mtu < 0) {
error = EINVAL;
VLOG_WARN("Invalid user configured shared mempool MTU.");
break;
}
if (!str_to_int(value, 0, &socket_id)) {
/* No socket specified. It will apply for all numas. */
socket_id = INT_MAX;
} else if (socket_id < 0) {
error = EINVAL;
VLOG_WARN("Invalid user configured shared mempool NUMA.");
break;
}
user_mempools = xrealloc(user_mempools, (n_user_mempools + 1) *
sizeof(struct user_mempool_config));
adj_mtu = FRAME_LEN_TO_MTU(dpdk_buf_size(mtu));
user_mempools[n_user_mempools].adj_mtu = adj_mtu;
user_mempools[n_user_mempools].socket_id = socket_id;
n_user_mempools++;
VLOG_INFO("User configured shared mempool set for: MTU %d, NUMA %s.",
mtu, socket_id == INT_MAX ? "ALL" : value);
}
if (error) {
VLOG_WARN("User configured shared mempools will not be used.");
n_user_mempools = 0;
free(user_mempools);
user_mempools = NULL;
}
free(copy);
}
/* Resolves the string setting named 'flag' from 'ovs_other_config'.
 *
 * If the setting is present and at most 'size' characters long, stores a
 * copy of it in '*new_val' and returns 1.  Otherwise stores a copy of
 * 'default_val' in '*new_val' and returns 0.
 *
 * In both cases '*new_val' is a fresh xstrdup() copy that the caller owns
 * and must free(). */
static int
process_vhost_flags(char *flag, const char *default_val, int size,
                    const struct smap *ovs_other_config,
                    char **new_val)
{
    const char *val = smap_get(ovs_other_config, flag);

    /* Compare lengths as size_t so that a negative 'size' can never turn into
     * a huge unsigned limit that accepts every value. */
    if (val && size >= 0 && strlen(val) <= (size_t) size) {
        *new_val = xstrdup(val);
        VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
        return 1;
    }

    if (val) {
        /* A value was configured but rejected: say so, rather than claiming
         * that nothing was provided. */
        VLOG_INFO("User-provided %s is longer than %d characters - "
                  "defaulting to %s", flag, size, default_val);
    } else {
        VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
    }
    *new_val = xstrdup(default_val);

    return 0;
}
/* Applies the vhost-user related settings found in 'ovs_other_config':
 *
 *   - "vhost-sock-dir": when accepted by process_vhost_flags(), it is taken
 *     as a path component below ovs_rundir() and the result is stored in
 *     'vhost_sock_dir'.  A value containing ".." is rejected and
 *     ovs_rundir() itself is used instead.  Without an accepted value,
 *     'vhost_sock_dir' is the default returned by process_vhost_flags().
 *
 *   - "vhost-iommu-support": boolean, default false, stored in
 *     'vhost_iommu_enabled'.
 *
 *   - "vhost-postcopy-support": boolean, default false, stored in
 *     'vhost_postcopy_enabled'.  It is forced back to false when
 *     memory_locked() reports true. */
static void
parse_vhost_config(const struct smap *ovs_other_config)
{
    char *sock_dir_subcomponent;

    if (process_vhost_flags("vhost-sock-dir", ovs_rundir(),
                            NAME_MAX, ovs_other_config,
                            &sock_dir_subcomponent)) {
        struct stat s;

        if (!strstr(sock_dir_subcomponent, "..")) {
            vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(),
                                       sock_dir_subcomponent);
            /* Only reported: the configured directory is still kept. */
            if (stat(vhost_sock_dir, &s)) {
                VLOG_ERR("vhost-user sock directory '%s' does not exist.",
                         vhost_sock_dir);
            }
        } else {
            vhost_sock_dir = xstrdup(ovs_rundir());
            VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid "
                     "characters '..' - using %s instead.",
                     ovs_rundir(), sock_dir_subcomponent, ovs_rundir());
        }
        /* The subcomponent was only an input to the strings built above. */
        free(sock_dir_subcomponent);
    } else {
        /* Default value: hand ownership of the copy to 'vhost_sock_dir'. */
        vhost_sock_dir = sock_dir_subcomponent;
    }

    vhost_iommu_enabled = smap_get_bool(ovs_other_config,
                                        "vhost-iommu-support", false);
    VLOG_INFO("IOMMU support for vhost-user-client %s.",
              vhost_iommu_enabled ? "enabled" : "disabled");

    vhost_postcopy_enabled = smap_get_bool(ovs_other_config,
                                           "vhost-postcopy-support", false);
    if (vhost_postcopy_enabled && memory_locked()) {
        VLOG_WARN("vhost-postcopy-support and mlockall are not compatible.");
        vhost_postcopy_enabled = false;
    }
    VLOG_INFO("POSTCOPY support for vhost-user-client %s.",
              vhost_postcopy_enabled ? "enabled" : "disabled");
}
/* Designated initializers for the 'struct netdev_class' members shared by
 * the DPDK netdev classes defined below.  Expands to a comma-separated list
 * with no trailing comma, so it must be followed by ',' when more members
 * come after it in an initializer. */
#define NETDEV_DPDK_CLASS_COMMON                                \
    .is_pmd = true,                                             \
    /* Device allocation. */                                    \
    .alloc = netdev_dpdk_alloc,                                 \
    .dealloc = netdev_dpdk_dealloc,                             \
    /* Device properties. */                                    \
    .get_numa_id = netdev_dpdk_get_numa_id,                     \
    .get_etheraddr = netdev_dpdk_get_etheraddr,                 \
    .set_etheraddr = netdev_dpdk_set_etheraddr,                 \
    .get_mtu = netdev_dpdk_get_mtu,                             \
    .set_mtu = netdev_dpdk_set_mtu,                             \
    .get_ifindex = netdev_dpdk_get_ifindex,                     \
    .get_carrier_resets = netdev_dpdk_get_carrier_resets,       \
    .set_miimon_interval = netdev_dpdk_set_miimon,              \
    .update_flags = netdev_dpdk_update_flags,                   \
    /* Policing and QoS. */                                     \
    .set_policing = netdev_dpdk_set_policing,                   \
    .get_qos_types = netdev_dpdk_get_qos_types,                 \
    .get_qos = netdev_dpdk_get_qos,                             \
    .set_qos = netdev_dpdk_set_qos,                             \
    /* QoS queues. */                                           \
    .get_queue = netdev_dpdk_get_queue,                         \
    .set_queue = netdev_dpdk_set_queue,                         \
    .delete_queue = netdev_dpdk_delete_queue,                   \
    .get_queue_stats = netdev_dpdk_get_queue_stats,             \
    .queue_dump_start = netdev_dpdk_queue_dump_start,           \
    .queue_dump_next = netdev_dpdk_queue_dump_next,             \
    .queue_dump_done = netdev_dpdk_queue_dump_done,             \
    /* Receive queue lifecycle. */                              \
    .rxq_alloc = netdev_dpdk_rxq_alloc,                         \
    .rxq_construct = netdev_dpdk_rxq_construct,                 \
    .rxq_destruct = netdev_dpdk_rxq_destruct,                   \
    .rxq_dealloc = netdev_dpdk_rxq_dealloc
/* Designated initializers for a 'struct netdev_class': everything in
 * NETDEV_DPDK_CLASS_COMMON plus the additional members listed here.
 * Expands to a comma-separated list with no trailing comma.
 *
 * Every line of the expansion except the last must end in a backslash: any
 * other text inside the list terminates the macro early. */
#define NETDEV_DPDK_CLASS_BASE                          \
    NETDEV_DPDK_CLASS_COMMON,                           \
    .init = netdev_dpdk_class_init,                     \
    .run = netdev_dpdk_run,                             \
    .wait = netdev_dpdk_wait,                           \
    .destruct = netdev_dpdk_destruct,                   \
    .set_tx_multiq = netdev_dpdk_set_tx_multiq,         \
    .get_carrier = netdev_dpdk_get_carrier,             \
    .get_stats = netdev_dpdk_get_stats,                 \
    .get_custom_stats = netdev_dpdk_get_custom_stats,   \
    .get_features = netdev_dpdk_get_features,           \
    .get_speed = netdev_dpdk_get_speed,                 \
    .get_status = netdev_dpdk_get_status,               \
    .reconfigure = netdev_dpdk_reconfigure,             \
    .rxq_recv = netdev_dpdk_rxq_recv
/* netdev class registered under the type name "dpdk".  Takes its common
 * members from NETDEV_DPDK_CLASS_BASE and adds the class-specific
 * constructor, configuration accessors and send function. */
static const struct netdev_class dpdk_class = {
    .type = "dpdk",
    NETDEV_DPDK_CLASS_BASE,

    /* Members specific to this class. */
    .construct = netdev_dpdk_construct,
    .send = netdev_dpdk_eth_send,
    .set_config = netdev_dpdk_set_config,
    .get_config = netdev_dpdk_get_config,
};
static const struct netdev_class dpdk_vhost_class = {
.type = "dpdkvhostuser",
NETDEV_DPDK_CLASS_COMMON,
.init = netdev_dpdk_vhost_class_init,
.construct = netdev_dpdk_vhost_construct,
.destruct = netdev_dpdk_vhost_destruct,
.send = netdev_dpdk_vhost_send,
.get_carrier = netdev_dpdk_vhost_get_carrier,
.get_stats = netdev_dpdk_vhost_get_stats,
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
.get_custom_stats = netdev_dpdk_vhost_get_custom_stats,
.get_status = netdev_dpdk_vhost_user_get_status,
.reconfigure = netdev_dpdk_vhost_reconfigure,
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
.rxq_recv = netdev_dpdk_vhost_rxq_recv,
.rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};
static const struct netdev_class dpdk_vhost_client_class = {
.type = "dpdkvhostuserclient",
NETDEV_DPDK_CLASS_COMMON,
.init = netdev_dpdk_vhost_class_init,
.construct = netdev_dpdk_vhost_client_construct,
.destruct = netdev_dpdk_vhost_destruct,
.get_config = netdev_dpdk_vhost_client_get_config,
.set_config = netdev_dpdk_vhost_client_set_config,
.send = netdev_dpdk_vhost_send,
.get_carrier = netdev_dpdk_vhost_get_carrier,
.get_stats = netdev_dpdk_vhost_get_stats,
netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 
tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Signed-off-by: David Marchand <david.marchand@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-06 16:58:55 +01:00
.get_custom_stats = netdev_dpdk_vhost_get_custom_stats,
.get_status = netdev_dpdk_vhost_user_get_status,
.reconfigure = netdev_dpdk_vhost_client_reconfigure,
dpif-netdev: Only poll enabled vhost queues. We currently poll all available queues based on the max queue count exchanged with the vhost peer and rely on the vhost library in DPDK to check the vring status beneath. This can lead to some overhead when we have a lot of unused queues. To enhance the situation, we can skip the disabled queues. On rxq notifications, we make use of the netdev's change_seq number so that the pmd thread main loop can cache the queue state periodically. $ ovs-appctl dpif-netdev/pmd-rxq-show pmd thread numa_id 0 core_id 1: isolated : true port: dpdk0 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 2: isolated : true port: vhost1 queue-id: 0 (enabled) pmd usage: 0 % port: vhost3 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 15: isolated : true port: dpdk1 queue-id: 0 (enabled) pmd usage: 0 % pmd thread numa_id 0 core_id 16: isolated : true port: vhost0 queue-id: 0 (enabled) pmd usage: 0 % port: vhost2 queue-id: 0 (enabled) pmd usage: 0 % $ while true; do ovs-appctl dpif-netdev/pmd-rxq-show |awk ' /port: / { tot++; if ($5 == "(enabled)") { en++; } } END { print "total: " tot ", enabled: " en }' sleep 1 done total: 6, enabled: 2 total: 6, enabled: 2 ... # Started vm, virtio devices are bound to kernel driver which enables # F_MQ + all queue pairs total: 6, enabled: 2 total: 66, enabled: 66 ... # Unbound vhost0 and vhost1 from the kernel driver total: 66, enabled: 66 total: 66, enabled: 34 ... # Configured kernel bound devices to use only 1 queue pair total: 66, enabled: 34 total: 66, enabled: 19 total: 66, enabled: 4 ... # While rebooting the vm total: 66, enabled: 4 total: 66, enabled: 2 ... total: 66, enabled: 66 ... # After shutting down the vm total: 66, enabled: 66 total: 66, enabled: 2 Signed-off-by: David Marchand <david.marchand@redhat.com> Acked-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-04-25 17:22:08 +02:00
.rxq_recv = netdev_dpdk_vhost_rxq_recv,
.rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};
void
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has it's MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
netdev_dpdk_register(const struct smap *ovs_other_config)
{
parse_mempool_config(ovs_other_config);
parse_user_mempools_list(ovs_other_config);
parse_vhost_config(ovs_other_config);
netdev-dpdk: Add shared mempool config. Mempools may currently be shared between DPDK ports based on port MTU and NUMA. With some hint from the user we can increase the sharing on MTU and hence reduce memory consumption in many cases. For example, a port with MTU 9000, uses a mempool with an mbuf size based on 9000 MTU. A port with MTU 1500, uses a different mempool with an mbuf size based on 1500 MTU. In this case, assuming same NUMA, both these ports could share the 9000 MTU mempool. The user must give a hint as order of creation of ports and setting of MTUs may vary and we need to ensure that upgrades from older OVS versions do not require more memory. This scheme can also prevent multiple mempools being created for cases where a port is added picking up a default MTU and an appropriate mempool, but later has it's MTU changed to a different value requiring a different mempool. Example usage: $ ovs-vsctl --no-wait set Open_vSwitch . \ other_config:shared-mempool-config=9000,1500:1,6000:1 Port added on NUMA 0: * MTU 1500, use mempool based on 9000 MTU * MTU 5000, use mempool based on 9000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Port added on NUMA 1: * MTU 1500, use mempool based on 1500 MTU * MTU 5000, use mempool based on 6000 MTU * MTU 9000, use mempool based on 9000 MTU * MTU 9300, use mempool based on 9300 MTU (existing behaviour) Default behaviour is unchanged and mempools are still only created when needed. Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Reviewed-by: David Marchand <david.marchand@redhat.com> Acked-by: Sunil Pai G <sunil.pai.g@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2022-06-24 11:13:23 +01:00
netdev_register_provider(&dpdk_class);
netdev_register_provider(&dpdk_vhost_class);
netdev_register_provider(&dpdk_vhost_client_class);
}