2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 09:58:01 +00:00
ovs/lib/netdev-dpdk.c

4870 lines
154 KiB
C
Raw Normal View History

/*
* Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "netdev-dpdk.h"
#include <string.h>
#include <signal.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <linux/virtio_net.h>
#include <sys/socket.h>
#include <linux/if.h>
#include <rte_bus_pci.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_eth_ring.h>
#include <rte_ethdev.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_meter.h>
#include <rte_pci.h>
#include <rte_vhost.h>
#include <rte_version.h>
#include <rte_flow.h>
#include "cmap.h"
#include "dirs.h"
#include "dp-packet.h"
#include "dpdk.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/vlog.h"
#include "openvswitch/match.h"
#include "ovs-numa.h"
#include "ovs-thread.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "openvswitch/shash.h"
#include "smap.h"
#include "sset.h"
#include "unaligned.h"
#include "timeval.h"
#include "uuid.h"
#include "unixctl.h"
/* Virtio queue indices: VIRTIO_RXQ is 0 and VIRTIO_TXQ is 1, so VIRTIO_QNUM
 * (2) is the number of queue kinds listed here. */
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
/* Rate limiter shared by this module's rate-limited log messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* NOTE(review): presumably the period, in seconds, of the port watchdog --
 * confirm at the use site. */
#define DPDK_PORT_WATCHDOG_INTERVAL 5
/* Alias for the project-wide CACHE_LINE_SIZE. */
#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"
/*
 * Frame-length and mbuf-size helpers.
 *
 * We need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and a drop in
 * performance for standard Ethernet MTU.
 */
/* Largest L2 overhead accounted for: Ethernet header, CRC and two VLAN
 * headers. */
#define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN \
+ (2 * VLAN_HEADER_LEN))
/* Frame length for 'mtu' with no VLAN headers: MTU + Ethernet header + CRC. */
#define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
/* Frame length for 'mtu' including the two VLAN headers allowed for by
 * ETHER_HDR_MAX_LEN. */
#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
/* Inverse of MTU_TO_FRAME_LEN(). */
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
- ETHER_HDR_LEN - ETHER_CRC_LEN)
/* Size for 'mtu': maximum frame length plus sizeof(struct dp_packet) plus
 * RTE_PKTMBUF_HEADROOM, rounded up to a multiple of RTE_CACHE_LINE_SIZE. */
#define MBUF_SIZE(mtu) ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \
+ sizeof(struct dp_packet) \
+ RTE_PKTMBUF_HEADROOM), \
RTE_CACHE_LINE_SIZE)
#define NETDEV_DPDK_MBUF_ALIGN 1024
#define NETDEV_DPDK_MAX_PKT_LEN 9728
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Max and min number of packets in the mempool. OVS tries to allocate a
 * mempool with MAX_NB_MBUF (4096 * 64 = 262144) mbufs: if this fails (because
 * the system doesn't have enough hugepages) we keep halving the number until
 * the allocation succeeds or we reach MIN_NB_MBUF (4096 * 4 = 16384). */
#define MAX_NB_MBUF (4096 * 64)
#define MIN_NB_MBUF (4096 * 4)
/* Mempool cache size, set to DPDK's RTE_MEMPOOL_CACHE_MAX_SIZE. */
#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Compile-time check: MAX_NB_MBUF can be halved repeatedly, without a
 * remainder, all the way down to MIN_NB_MBUF. */
BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
== 0);
/* The smallest possible NB_MBUF that we're going to try should be a multiple
 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
% MP_CACHE_SZ == 0);
/*
 * Names of DPDK extended-statistics (xstats) counters, as strings:
 * rx/tx packet counts bucketed by frame size, rx/tx multicast and broadcast
 * packet counts, and rx error counts.
 */
#define XSTAT_RX_64_PACKETS "rx_size_64_packets"
#define XSTAT_RX_65_TO_127_PACKETS "rx_size_65_to_127_packets"
#define XSTAT_RX_128_TO_255_PACKETS "rx_size_128_to_255_packets"
#define XSTAT_RX_256_TO_511_PACKETS "rx_size_256_to_511_packets"
#define XSTAT_RX_512_TO_1023_PACKETS "rx_size_512_to_1023_packets"
#define XSTAT_RX_1024_TO_1522_PACKETS "rx_size_1024_to_1522_packets"
#define XSTAT_RX_1523_TO_MAX_PACKETS "rx_size_1523_to_max_packets"
#define XSTAT_TX_64_PACKETS "tx_size_64_packets"
#define XSTAT_TX_65_TO_127_PACKETS "tx_size_65_to_127_packets"
#define XSTAT_TX_128_TO_255_PACKETS "tx_size_128_to_255_packets"
#define XSTAT_TX_256_TO_511_PACKETS "tx_size_256_to_511_packets"
#define XSTAT_TX_512_TO_1023_PACKETS "tx_size_512_to_1023_packets"
#define XSTAT_TX_1024_TO_1522_PACKETS "tx_size_1024_to_1522_packets"
#define XSTAT_TX_1523_TO_MAX_PACKETS "tx_size_1523_to_max_packets"
#define XSTAT_RX_MULTICAST_PACKETS "rx_multicast_packets"
#define XSTAT_TX_MULTICAST_PACKETS "tx_multicast_packets"
#define XSTAT_RX_BROADCAST_PACKETS "rx_broadcast_packets"
#define XSTAT_TX_BROADCAST_PACKETS "tx_broadcast_packets"
#define XSTAT_RX_UNDERSIZED_ERRORS "rx_undersized_errors"
#define XSTAT_RX_OVERSIZE_ERRORS "rx_oversize_errors"
#define XSTAT_RX_FRAGMENTED_ERRORS "rx_fragmented_errors"
#define XSTAT_RX_JABBER_ERRORS "rx_jabber_errors"
/* NOTE(review): presumably the id of the first NUMA socket -- confirm at the
 * use sites. */
#define SOCKET0 0
/* Default size of Physical NIC RXQ */
#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
/* Default size of Physical NIC TXQ */
#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
/* Maximum size of Physical NIC Queues */
#define NIC_PORT_MAX_Q_SIZE 4096
#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
* yet mapped to another queue. */
/* Port id value used to mean "no valid port": RTE_MAX_ETHPORTS. */
#define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
/* printf-style conversion matching dpdk_port_t. */
#define DPDK_PORT_ID_FMT "%"PRIu16
/* NOTE(review): presumably the retry limit for vhost enqueue attempts --
 * confirm at the use site. */
#define VHOST_ENQ_RETRY_NUM 8
/* The larger of PATH_MAX and IFNAMSIZ. */
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
/* File-scope constant Ethernet device configuration: RSS receive multi-queue
 * mode hashing on IP, UDP and TCP; every rx offload flag listed below is
 * cleared; no tx multi-queue mode. */
static const struct rte_eth_conf port_conf = {
.rxmode = {
.mq_mode = ETH_MQ_RX_RSS,
.split_hdr_size = 0,
.header_split = 0, /* Header Split disabled */
.hw_ip_checksum = 0, /* IP checksum offload disabled */
.hw_vlan_filter = 0, /* VLAN filtering disabled */
.jumbo_frame = 0, /* Jumbo Frame Support disabled */
.hw_strip_crc = 0, /* Hardware CRC stripping disabled */
},
.rx_adv_conf = {
.rss_conf = {
/* NOTE(review): a NULL key presumably selects the driver's default RSS
 * key -- confirm against the DPDK rte_eth_rss_conf documentation. */
.rss_key = NULL,
.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
},
},
.txmode = {
.mq_mode = ETH_MQ_TX_NONE,
},
};
/*
 * A mapping from ufid to dpdk rte_flow.
 */
static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
/* One entry of the ufid -> rte_flow mapping. */
struct ufid_to_rte_flow_data {
struct cmap_node node; /* Node for linking into a cmap. */
ovs_u128 ufid; /* The ufid (128-bit value) this entry is for. */
struct rte_flow *rte_flow; /* The rte_flow associated with 'ufid'. */
};
/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 *
 * Each callback identifies the device by its integer 'vid'.
 */
static int new_device(int vid);
static void destroy_device(int vid);
static int vring_state_changed(int vid, uint16_t queue_id, int enable);
/* vhost callback table; 'features_changed' is deliberately left unset. */
static const struct vhost_device_ops virtio_net_device_ops =
{
.new_device = new_device,
.destroy_device = destroy_device,
.vring_state_changed = vring_state_changed,
.features_changed = NULL
};
/* DPDK ring size; asserted at build time to be a power of two. */
enum { DPDK_RING_SIZE = 256 };
BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
/* NOTE(review): presumably a drain interval expressed in TSC cycles --
 * confirm at the use site. */
enum { DRAIN_TSC = 200000ULL };
/* Kind of DPDK device behind a 'struct netdev_dpdk'.  The enumerators rely
 * on C's implicit numbering, which starts at 0 and increments by one. */
enum dpdk_dev_type {
    DPDK_DEV_ETH,       /* == 0 */
    DPDK_DEV_VHOST      /* == 1 */
};
/* Quality of Service */
/* An instance of a QoS configuration. Always associated with a particular
 * network device.
 *
 * Each QoS implementation subclasses this with whatever additional data it
 * needs.
 */
struct qos_conf {
const struct dpdk_qos_ops *ops; /* Operations of the QoS implementation. */
rte_spinlock_t lock; /* NOTE(review): presumably serializes use of this
                      * configuration -- confirm at the lock sites. */
};
/* A particular implementation of dpdk QoS operations.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted. All of them must be provided, except
 * where otherwise noted.
 */
struct dpdk_qos_ops {
/* Name of the QoS type */
const char *qos_name;
/* Called to construct a qos_conf object. The implementation should make
 * the appropriate calls to configure QoS according to 'details'.
 *
 * The contents of 'details' should be documented as valid for 'qos_name'
 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
 * (which is built as ovs-vswitchd.conf.db(8)).
 *
 * This function must return 0 if and only if it sets '*conf' to an
 * initialized 'struct qos_conf'.
 *
 * For all QoS implementations it should always be non-null.
 */
int (*qos_construct)(const struct smap *details, struct qos_conf **conf);
/* Destroys the data structures allocated by the implementation as part of
 * 'qos_conf'.
 *
 * For all QoS implementations it should always be non-null.
 */
void (*qos_destruct)(struct qos_conf *conf);
/* Retrieves details of 'conf' configuration into 'details'.
 *
 * The contents of 'details' should be documented as valid for 'qos_name'
 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
 * (which is built as ovs-vswitchd.conf.db(8)).
 */
int (*qos_get)(const struct qos_conf *conf, struct smap *details);
/* Returns true if 'conf' is already configured according to 'details'.
 *
 * The contents of 'details' should be documented as valid for 'qos_name'
 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
 * (which is built as ovs-vswitchd.conf.db(8)).
 *
 * For all QoS implementations it should always be non-null.
 */
bool (*qos_is_equal)(const struct qos_conf *conf,
const struct smap *details);
/* Modify an array of rte_mbufs. The modification is specific to
 * each qos implementation.
 *
 * The function should take an array of mbufs and an int representing
 * the current number of mbufs present in the array.
 *
 * After the function has performed a qos modification to the array of
 * mbufs it returns an int representing the number of mbufs now present in
 * the array. This value can then be passed to the port send function
 * along with the modified array for transmission.
 *
 * NOTE(review): 'should_steal' is not described here; presumably it tells
 * the implementation whether it owns (and may free) the mbufs it drops --
 * confirm against the implementations.
 *
 * For all QoS implementations it should always be non-null.
 */
int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
int pkt_cnt, bool should_steal);
};
/* dpdk_qos_ops for each type of user space QoS implementation.  This is only
 * a declaration here, so that 'qos_confs' below can take its address. */
static const struct dpdk_qos_ops egress_policer_ops;
/*
 * NULL-terminated array of pointers to all supported QoS operations.
 */
static const struct dpdk_qos_ops *const qos_confs[] = {
&egress_policer_ops,
NULL
};
/* Protects 'dpdk_list' (and the other objects annotated as guarded by it). */
static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct netdev_dpdk's, linked through their 'list_node'. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_list);

/* Protects 'dpdk_mp_list'.  Lock ordering: acquired after 'dpdk_mutex'. */
static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
    = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_mp's. */
static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_mp_list);
struct dpdk_mp {
struct rte_mempool *mp;
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
int mtu;
int socket_id;
int refcount;
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
};
/* There should be one 'struct dpdk_tx_queue' created for
 * each cpu core. */
struct dpdk_tx_queue {
rte_spinlock_t tx_lock; /* Protects the members and the NIC queue
                         * from concurrent access. It is used only
                         * if the queue is shared among different
                         * pmd threads (see 'concurrent_txq'). */
int map; /* Mapping of configured vhost-user queues
          * to the queues enabled by the guest. */
};
/* DPDK has no way to remove dpdk ring ethernet devices, so we have to keep
 * them around once they've been created. */
static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
= OVS_LIST_INITIALIZER(&dpdk_ring_list);
/* One dpdk ring device; instances are linked into 'dpdk_ring_list'. */
struct dpdk_ring {
/* For the client rings */
struct rte_ring *cring_tx;
struct rte_ring *cring_rx;
unsigned int user_port_id; /* User given port no, parsed from port name */
dpdk_port_t eth_port_id; /* ethernet device port id */
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
};
/* State for ingress policing, built on the DPDK single-rate three-color
 * marker (srTCM) meter types. */
struct ingress_policer {
struct rte_meter_srtcm_params app_srtcm_params; /* Meter parameters. */
struct rte_meter_srtcm in_policer; /* Meter state. */
rte_spinlock_t policer_lock; /* NOTE(review): presumably serializes use of
                              * 'in_policer' -- confirm at the lock sites. */
};
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports. Add Rx checksum offloading feature support on DPDK physical ports. By default, the Rx checksum offloading is enabled if NIC supports. However, the checksum offloading can be turned OFF either while adding a new DPDK physical port to OVS or at runtime. The rx checksum offloading can be turned off by setting the parameter to 'false'. For eg: To disable the rx checksum offloading when adding a port, 'ovs-vsctl add-port br0 dpdk0 -- \ set Interface dpdk0 type=dpdk options:rx-checksum-offload=false' OR (to disable at run time after port is being added to OVS) 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false' Similarly to turn ON rx checksum offloading at run time, 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true' The Tx checksum offloading support is not implemented due to the following reasons. 1) Checksum offloading and vectorization are mutually exclusive in DPDK poll mode driver. Vector packet processing is turned OFF when checksum offloading is enabled which causes significant performance drop at Tx side. 2) Normally, OVS generates checksum for tunnel packets in software at the 'tunnel push' operation, where the tunnel headers are created. However enabling Tx checksum offloading involves, *) Mark every packets for tx checksum offloading at 'tunnel_push' and recirculate. *) At the time of xmit, validate the same flag and instruct the NIC to do the checksum calculation. In case NIC doesnt support Tx checksum offloading, the checksum calculation has to be done in software before sending out the packets. No significant performance improvement noticed with Tx checksum offloading due to the e overhead of additional validations + non vector packet processing. In some test scenarios, it introduces performance drop too. 
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling decapsulation even though the SSE vector Rx function is disabled in DPDK poll mode driver. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Acked-by: Jesse Gross <jesse@kernel.org> Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
/* Hardware offload features, as bit flags.  'struct netdev_dpdk' keeps a set
 * of these in its 'hw_ol_features' member. */
enum dpdk_hw_ol_features {
    NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
    NETDEV_RX_HW_CRC_STRIP = 1 << 1,
};
/*
 * In order to avoid confusion in variables names, following naming convention
 * should be used, if possible:
 *
 *     'struct netdev'          : 'netdev'
 *     'struct netdev_dpdk'     : 'dev'
 *     'struct netdev_rxq'      : 'rxq'
 *     'struct netdev_rxq_dpdk' : 'rx'
 *
 * Example:
 *     struct netdev *netdev = netdev_from_name(name);
 *     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 *
 * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
 * already defined.
 */

/* A DPDK-backed network device.  Members are grouped into
 * CACHE_LINE_SIZE-padded sections; keep the member order within and across
 * the sections as it is, since it determines the structure's layout. */
struct netdev_dpdk {
    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
        dpdk_port_t port_id;

        /* If true, device was attached by rte_eth_dev_attach(). */
        bool attached;
        /* If true, rte_eth_dev_start() was successfully called */
        bool started;
        struct eth_addr hwaddr;
        int mtu;
        int socket_id;
        int buf_size;
        int max_packet_len;
        enum dpdk_dev_type type;
        enum netdev_flags flags;
        int link_reset_cnt;
        char *devargs;  /* Device arguments for dpdk ports */
        struct dpdk_tx_queue *tx_q;
        struct rte_eth_link link;
    );

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
        struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
        struct dpdk_mp *dpdk_mp;

        /* virtio identifier for vhost devices */
        ovsrcu_index vid;

        /* True if vHost device is 'up' and has been reconfigured at least once */
        bool vhost_reconfigured;
        /* 3 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Identifier used to distinguish vhost devices from each other. */
        char vhost_id[PATH_MAX];
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev up;
        /* In dpdk_list. */
        struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

        /* QoS configuration and lock for the device */
        OVSRCU_TYPE(struct qos_conf *) qos_conf;

        /* Ingress Policer */
        OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
        uint32_t policer_rate;
        uint32_t policer_burst;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev_stats stats;
        /* Protects stats */
        rte_spinlock_t stats_lock;
        /* 44 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* The following properties cannot be changed when a device is running,
         * so we remember the request and update them next time
         * netdev_dpdk*_reconfigure() is called */
        int requested_mtu;
        int requested_n_txq;
        int requested_n_rxq;
        int requested_rxq_size;
        int requested_txq_size;

        /* Number of rx/tx descriptors for physical devices */
        int rxq_size;
        int txq_size;

        /* Socket ID detected when vHost device is brought up */
        int requested_socket_id;

        /* Denotes whether vHost port is client/server mode */
        uint64_t vhost_driver_flags;

        /* DPDK-ETH Flow control */
        struct rte_eth_fc_conf fc_conf;

        /* DPDK-ETH hardware offload features,
         * from the enum set 'dpdk_hw_ol_features' */
        uint32_t hw_ol_features;

        /* Properties for link state change detection mode.
         * If lsc_interrupt_mode is set to false, poll mode is used,
         * otherwise interrupt mode is used. */
        bool requested_lsc_interrupt_mode;
        bool lsc_interrupt_mode;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Names of all XSTATS counters */
        struct rte_eth_xstat_name *rte_xstats_names;
        int rte_xstats_names_size;
        int rte_xstats_ids_size;
        uint64_t *rte_xstats_ids;
    );
};
/* DPDK flavour of a receive queue: the generic 'struct netdev_rxq' plus a
 * DPDK port identifier. */
struct netdev_rxq_dpdk {
/* Generic receive-queue state, embedded as the 'up' member. */
struct netdev_rxq up;
/* DPDK port identifier.  Presumably the port this queue receives from;
 * confirm against the rxq construct/recv code. */
dpdk_port_t port_id;
};
/* Prototypes declared ahead of use.  The two destructors are compared
 * against a class's 'destruct' member in is_dpdk_class(). */
static void netdev_dpdk_destruct(struct netdev *netdev);
static void netdev_dpdk_vhost_destruct(struct netdev *netdev);
static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
/* The next two are declared without 'static', i.e. with external linkage. */
int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
/* Returns true if 'class' is one of the DPDK netdev classes.
 *
 * A class is recognised by its 'destruct' callback: it must be either
 * netdev_dpdk_destruct() or netdev_dpdk_vhost_destruct(). */
static bool
is_dpdk_class(const struct netdev_class *class)
{
    if (class->destruct == netdev_dpdk_destruct) {
        return true;
    }

    return class->destruct == netdev_dpdk_vhost_destruct;
}
/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accommodate the packet in its
 * entirety. Furthermore, certain drivers need to ensure that there is also
 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
 * frames). If the RX buffer is too small, then the driver enables scatter RX
 * behaviour, which reduces performance. To prevent this, use a buffer size
 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
 *
 * 'mtu' is the MTU the buffer must be able to hold.
 *
 * Returns the maximum frame length for 'mtu' plus RTE_PKTMBUF_HEADROOM,
 * rounded up to a multiple of NETDEV_DPDK_MBUF_ALIGN.
 */
static uint32_t
dpdk_buf_size(int mtu)
{
return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM),
NETDEV_DPDK_MBUF_ALIGN);
}
/* Obtains 'sz' bytes of zero-filled memory from DPDK via rte_zmalloc(),
 * tagged OVS_VPORT_DPDK and aligned to OVS_CACHE_LINE_SIZE.
 *
 * Callers must check the result: unlike xmalloc(), a NULL return is possible
 * when the allocation fails. */
static void *
dpdk_rte_mzalloc(size_t sz)
{
    void *mem = rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);

    return mem;
}
/* Releases the DPDK mbuf that backs packet 'p' by handing it to
 * rte_pktmbuf_free().
 *
 * 'p' is reinterpreted directly as a 'struct rte_mbuf *'.
 * NOTE(review): this relies on a dp_packet being usable as an rte_mbuf in
 * DPDK builds (presumably the mbuf is its first member) -- confirm in
 * dp-packet.h. */
void
free_dpdk_buf(struct dp_packet *p)
{
    struct rte_mbuf *pkt = (struct rte_mbuf *) p;

    rte_pktmbuf_free(pkt);
}
/* Per-object initialiser with the rte_mempool object-callback signature.
 *
 * '_p' is the object being initialised; it is treated as an rte_mbuf and
 * passed, viewed as a dp_packet, to dp_packet_init_dpdk() together with the
 * mbuf's 'buf_len'.  The mempool, opaque argument and object index are not
 * used. */
static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
                     void *opaque_arg OVS_UNUSED,
                     void *_p,
                     unsigned i OVS_UNUSED)
{
    struct rte_mbuf *mbuf = _p;
    struct dp_packet *packet = (struct dp_packet *) mbuf;

    dp_packet_init_dpdk(packet, mbuf->buf_len);
}
/* Reports whether every mbuf of 'mp' has been returned to the mempool.
 * Returns the result of rte_mempool_full().  The caller must hold
 * 'dpdk_mp_mutex'. */
static int
dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
    /* The question here is whether all mbufs are back in the mempool.
     * rte_mempool_full() is not atomic, but it is the best tool available.
     * Because we have stopped requesting mbufs from this mempool, mbufs no
     * longer move from the 'mempool ring' to the 'mempool cache'.
     * rte_mempool_full() counts the ring before the caches, so in this use
     * case it cannot produce a false positive, and false negatives are
     * handled by the caller.
     *
     * Should a future rte_mempool_full() implementation change, a false
     * positive might become possible.  That would most likely still be
     * fine, since mempool freeing performs additional checks, but it would
     * make things racy.
     */
    int all_mbufs_returned = rte_mempool_full(mp);

    return all_mbufs_returned;
}
/* Free unused mempools. */
static void
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
{
struct dpdk_mp *dmp, *next;
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
ovs_list_remove(&dmp->list_node);
rte_mempool_free(dmp->mp);
rte_free(dmp);
}
}
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Calculating the required number of mbufs differs depending on the
* mempool model being used. Check if per port memory is in use before
* calculating.
*/
static uint32_t
dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
uint32_t n_mbufs;
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
if (!per_port_mp) {
/* Shared memory are being used.
* XXX: this is a really rough method of provisioning memory.
* It's impossible to determine what the exact memory requirements are
* when the number of ports and rxqs that utilize a particular mempool
* can change dynamically at runtime. For now, use this rough
* heurisitic.
*/
if (mtu >= ETHER_MTU) {
n_mbufs = MAX_NB_MBUF;
} else {
n_mbufs = MIN_NB_MBUF;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
} else {
/* Per port memory is being used.
* XXX: rough estimation of number of mbufs required for this port:
* <packets required to fill the device rxqs>
* + <packets that could be stuck on other ports txqs>
* + <packets in the pmd threads>
* + <additional memory for corner cases>
*/
n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
+ dev->requested_n_txq * dev->requested_txq_size
+ MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
+ MIN_NB_MBUF;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
return n_mbufs;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
static struct dpdk_mp *
dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
char mp_name[RTE_MEMPOOL_NAMESIZE];
const char *netdev_name = netdev_get_name(&dev->up);
int socket_id = dev->requested_socket_id;
uint32_t n_mbufs;
uint32_t hash = hash_string(netdev_name, 0);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
struct dpdk_mp *dmp = NULL;
int ret;
dmp = dpdk_rte_mzalloc(sizeof *dmp);
if (!dmp) {
return NULL;
}
dmp->socket_id = socket_id;
dmp->mtu = mtu;
dmp->refcount = 1;
n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp);
do {
/* Full DPDK memory pool name must be unique and cannot be
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
* longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
* mempool case this can result in one device using a mempool
* which references a different device in its name. However as
* mempool names are hashed, the device name will not be readable
* so this is not an issue for tasks such as debugging.
*/
ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
netdev-dpdk: Fix mempool creation with large MTU. Currently mempool name size limited to 25 characters by RTE_MEMPOOL_NAMESIZE. netdev-dpdk tries to create mempool with the following name pattern: "ovs_%{hash}_%{socket}_%{mtu}_%{n_mbuf}". We have 3 chars for "ovs" + 4 chars for delimiters + 8 chars for hash (because it's the 32 bit integer printed in hex) + 1 char for socket_id (mostly 1, but it could be 2 on some systems; larger?) = 16. Only 25 - 16 = 9 characters remains for mtu + n_mbufs. Minimum usual value for mtu is 1500 --> 2030 (4 chars) after dpdk_buf_size conversion and the minimum value for n_mbufs is 16384 (5 chars). So, all the 9 characters are used. If we'll try to create port with mtu = 9500, mempool creation will fail, because FRAME_LEN_TO_MTU(dpdk_buf_size(9500)) = 10222 (5 chars) and this value will overflow the RTE_MEMPOOL_NAMESIZE limit. Same issue will happen if we'll try to create port with big enough number of queues or will try to create big enough number of PMD threads (number of tx queues will enlarge the mempool requirements). Fix that by removing the delimiters. To keep the readability (at least partial) of the mempool names exact field sizes with zero padding are used. 
Following limits should be suitable for now: - Hash length: 8 chars (uint32_t in hex) - Socket ID : 2 chars (For systems with up to 10 sockets) - MTU : 5 chars (MTU (10^5 - 1) should be enough for now) - n_mbufs : 7 chars (Up to 10^7 of mbufs) Total : 22 + 3 (for "ovs") = 25 CC: Antonio Fischetti <antonio.fischetti@intel.com> CC: Robert Wojciechowicz <robertx.wojciechowicz@intel.com> Fixes: f06546a51dd8 ("Fix mempool names to reflect socket id.") Fixes: d555d9bded5f ("netdev-dpdk: Create separate memory pool for each port.") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Antonio Fischetti <antonio.fischetti@intel.com> Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Tested-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-11-10 10:12:06 +03:00
"ovs%08x%02d%05d%07u",
hash, socket_id, mtu, n_mbufs);
if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
VLOG_DBG("snprintf returned %d. "
"Failed to generate a mempool name for \"%s\". "
"Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
ret, netdev_name, hash, socket_id, mtu, n_mbufs);
break;
}
VLOG_DBG("Port %s: Requesting a mempool of %u mbufs "
"on socket %d for %d Rx and %d Tx queues.",
netdev_name, n_mbufs, socket_id,
dev->requested_n_rxq, dev->requested_n_txq);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs,
MP_CACHE_SZ,
sizeof (struct dp_packet)
- sizeof (struct rte_mbuf),
MBUF_SIZE(mtu)
- sizeof(struct dp_packet),
socket_id);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
if (dmp->mp) {
VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
mp_name, n_mbufs);
/* rte_pktmbuf_pool_create has done some initialization of the
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
* rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
* initializes some OVS specific fields of dp_packet.
*/
rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
return dmp;
} else if (rte_errno == EEXIST) {
/* A mempool with the same name already exists. We just
* retrieve its pointer to be returned to the caller. */
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
dmp->mp = rte_mempool_lookup(mp_name);
/* As the mempool create returned EEXIST we can expect the
* lookup has returned a valid pointer. If for some reason
* that's not the case we keep track of it. */
VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
mp_name, dmp->mp);
return dmp;
} else {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
VLOG_DBG("Failed to create mempool \"%s\" with a request of "
"%u mbufs, retrying with %u mbufs",
mp_name, n_mbufs, n_mbufs / 2);
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
} while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
mp_name, n_mbufs);
rte_free(dmp);
return NULL;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
static struct dpdk_mp *
dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
struct dpdk_mp *dmp, *next;
bool reuse = false;
ovs_mutex_lock(&dpdk_mp_mutex);
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Check if shared memory is being used, if so check existing mempools
* to see if reuse is possible. */
if (!per_port_mp) {
LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
if (dmp->socket_id == dev->requested_socket_id
&& dmp->mtu == mtu) {
VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
dmp->refcount++;
reuse = true;
break;
}
}
}
/* Sweep mempools after reuse or before create. */
dpdk_mp_sweep();
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
if (!reuse) {
dmp = dpdk_mp_create(dev, mtu, per_port_mp);
if (dmp) {
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Shared memory will hit the reuse case above so will not
* request a mempool that already exists but we need to check
* for the EEXIST case for per port memory case. Compare the
* mempool returned by dmp to each entry in dpdk_mp_list. If a
* match is found, free dmp as a new entry is not required, set
* dmp to point to the existing entry and increment the refcount
* to avoid being freed at a later stage.
*/
if (per_port_mp && rte_errno == EEXIST) {
LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
if (dmp->mp == next->mp) {
rte_free(dmp);
dmp = next;
dmp->refcount++;
}
}
} else {
ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
}
}
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
ovs_mutex_unlock(&dpdk_mp_mutex);
return dmp;
}
/* Releases one reference on 'dmp'.  A null 'dmp' is accepted and ignored.
 * Only the reference count is adjusted here, while holding 'dpdk_mp_mutex';
 * this function itself frees nothing. */
static void
dpdk_mp_put(struct dpdk_mp *dmp)
{
    if (dmp) {
        ovs_mutex_lock(&dpdk_mp_mutex);
        /* Dropping a reference that was never taken is a caller bug. */
        ovs_assert(dmp->refcount);
        dmp->refcount -= 1;
        ovs_mutex_unlock(&dpdk_mp_mutex);
    }
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
/* Depending on the memory model being used this function tries to
 * identify and reuse an existing mempool or tries to allocate a new
 * mempool on requested_socket_id with mbuf size corresponding to the
 * requested_mtu.
 *
 * Returns 0 on success, in which case 'dev->dpdk_mp', 'dev->mtu',
 * 'dev->socket_id' and 'dev->max_packet_len' reflect the new configuration
 * (or were already valid and are left as they are).  On failure returns a
 * nonzero error code (the 'rte_errno' value, or ENOMEM when 'rte_errno' is
 * not set) and leaves 'dev' unchanged. */
static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
    bool per_port_mp = dpdk_per_port_memory();
    struct dpdk_mp *dmp;

    /* With shared memory we do not need to configure a mempool if the MTU
     * and socket ID have not changed, the previous configuration is still
     * valid so return 0 */
    if (!per_port_mp && dev->mtu == dev->requested_mtu
        && dev->socket_id == dev->requested_socket_id) {
        return 0;
    }

    dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
    if (!dmp) {
        /* Read 'rte_errno' once so that the logged and the returned codes
         * agree, and never report success (0) for a failed request: the
         * caller would otherwise carry on without a usable mempool. */
        int err = rte_errno ? rte_errno : ENOMEM;

        VLOG_ERR("Failed to create memory pool for netdev "
                 "%s, with MTU %d on socket %d: %s\n",
                 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
                 rte_strerror(err));
        return err;
    }

    /* Check for any pre-existing dpdk_mp for the device before accessing
     * the associated mempool.
     */
    if (dev->dpdk_mp != NULL) {
        /* A new MTU was requested, decrement the reference count for the
         * devices current dpdk_mp. This is required even if a pointer to
         * same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
         * has already been incremented by dpdk_mp_get at this stage so it
         * must be decremented to keep an accurate refcount for the
         * dpdk_mp.
         */
        dpdk_mp_put(dev->dpdk_mp);
    }

    /* Commit the new configuration only once the mempool is in hand. */
    dev->dpdk_mp = dmp;
    dev->mtu = dev->requested_mtu;
    dev->socket_id = dev->requested_socket_id;
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);

    return 0;
}
static void
check_link_status(struct netdev_dpdk *dev)
{
struct rte_eth_link link;
rte_eth_link_get_nowait(dev->port_id, &link);
if (dev->link.link_status != link.link_status) {
netdev_change_seq_changed(&dev->up);
dev->link_reset_cnt++;
dev->link = link;
if (dev->link.link_status) {
VLOG_DBG_RL(&rl,
"Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
dev->port_id, (unsigned) dev->link.link_speed,
(dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
? "full-duplex" : "half-duplex");
} else {
VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
dev->port_id);
}
}
}
/* Thread body that periodically refreshes link state.  It detaches itself and
 * then loops without end: each pass walks 'dpdk_list' while holding
 * 'dpdk_mutex', calls check_link_status() on every device whose type is
 * DPDK_DEV_ETH while holding that device's mutex, and finally sleeps for
 * DPDK_PORT_WATCHDOG_INTERVAL.  The argument is unused; the trailing return
 * is never reached. */
static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *port;

    pthread_detach(pthread_self());

    while (1) {
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (port, list_node, &dpdk_list) {
            ovs_mutex_lock(&port->mutex);
            /* Only DPDK_DEV_ETH devices are polled for link state. */
            if (port->type == DPDK_DEV_ETH) {
                check_link_status(port);
            }
            ovs_mutex_unlock(&port->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);

        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
    }

    return NULL;
}
static int
dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
{
int diag = 0;
int i;
struct rte_eth_conf conf = port_conf;
struct rte_eth_dev_info info;
uint16_t conf_mtu;
/* As of DPDK 17.11.1 a few PMDs require to explicitly enable
* scatter to support jumbo RX. Checking the offload capabilities
* is not an option as PMDs are not required yet to report
* them. The only reliable info is the driver name and knowledge
* (testing or code review). Listing all such PMDs feels harder
* than highlighting the one known not to need scatter */
if (dev->mtu > ETHER_MTU) {
rte_eth_dev_info_get(dev->port_id, &info);
if (strncmp(info.driver_name, "net_nfp", 7)) {
conf.rxmode.enable_scatter = 1;
}
}
conf.intr_conf.lsc = dev->lsc_interrupt_mode;
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports. Add Rx checksum offloading feature support on DPDK physical ports. By default, the Rx checksum offloading is enabled if NIC supports. However, the checksum offloading can be turned OFF either while adding a new DPDK physical port to OVS or at runtime. The rx checksum offloading can be turned off by setting the parameter to 'false'. For eg: To disable the rx checksum offloading when adding a port, 'ovs-vsctl add-port br0 dpdk0 -- \ set Interface dpdk0 type=dpdk options:rx-checksum-offload=false' OR (to disable at run time after port is being added to OVS) 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false' Similarly to turn ON rx checksum offloading at run time, 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true' The Tx checksum offloading support is not implemented due to the following reasons. 1) Checksum offloading and vectorization are mutually exclusive in DPDK poll mode driver. Vector packet processing is turned OFF when checksum offloading is enabled which causes significant performance drop at Tx side. 2) Normally, OVS generates checksum for tunnel packets in software at the 'tunnel push' operation, where the tunnel headers are created. However enabling Tx checksum offloading involves, *) Mark every packets for tx checksum offloading at 'tunnel_push' and recirculate. *) At the time of xmit, validate the same flag and instruct the NIC to do the checksum calculation. In case NIC doesnt support Tx checksum offloading, the checksum calculation has to be done in software before sending out the packets. No significant performance improvement noticed with Tx checksum offloading due to the e overhead of additional validations + non vector packet processing. In some test scenarios, it introduces performance drop too. 
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling decapsulation even though the SSE vector Rx function is disabled in DPDK poll mode driver. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Acked-by: Jesse Gross <jesse@kernel.org> Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
conf.rxmode.hw_ip_checksum = (dev->hw_ol_features &
NETDEV_RX_CHECKSUM_OFFLOAD) != 0;
if (dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP) {
conf.rxmode.hw_strip_crc = 1;
}
/* A device may report more queues than it makes available (this has
* been observed for Intel xl710, which reserves some of them for
* SRIOV): rte_eth_*_queue_setup will fail if a queue is not
* available. When this happens we can retry the configuration
* and request less queues */
while (n_rxq && n_txq) {
if (diag) {
VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
}
diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
if (diag) {
VLOG_WARN("Interface %s eth_dev setup error %s\n",
dev->up.name, rte_strerror(-diag));
break;
}
diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
if (diag) {
/* A device may not support rte_eth_dev_set_mtu, in this case
* flag a warning to the user and include the devices configured
* MTU value that will be used instead. */
if (-ENOTSUP == diag) {
rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
VLOG_WARN("Interface %s does not support MTU configuration, "
"max packet size supported is %"PRIu16".",
dev->up.name, conf_mtu);
} else {
VLOG_ERR("Interface %s MTU (%d) setup error: %s",
dev->up.name, dev->mtu, rte_strerror(-diag));
break;
}
}
for (i = 0; i < n_txq; i++) {
diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
dev->socket_id, NULL);
if (diag) {
VLOG_INFO("Interface %s unable to setup txq(%d): %s",
dev->up.name, i, rte_strerror(-diag));
break;
}
}
if (i != n_txq) {
/* Retry with less tx queues */
n_txq = i;
continue;
}
for (i = 0; i < n_rxq; i++) {
diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
dev->socket_id, NULL,
dev->dpdk_mp->mp);
if (diag) {
VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
dev->up.name, i, rte_strerror(-diag));
break;
}
}
if (i != n_rxq) {
/* Retry with less rx queues */
n_rxq = i;
continue;
}
dev->up.n_rxq = n_rxq;
dev->up.n_txq = n_txq;
return 0;
}
return diag;
}
/* Pushes the flow control settings held in 'dev->fc_conf' to the port
 * 'dev->port_id'.  A failure is only logged; nothing is reported back to the
 * caller. */
static void
dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
{
    int error = rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf);

    if (error) {
        VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }
}
/* Configures and starts the Ethernet port 'dev->port_id', then caches the
 * port's properties (offload features, MAC address, link state, mbuf buffer
 * size, flow control configuration) in 'dev'.
 *
 * The number of rx/tx queues is capped at what the device reports as its
 * maximum.
 *
 * Returns 0 on success.  If configuring or starting the port fails, logs an
 * error and returns the negated DPDK return code.  Failure to read the flow
 * control parameters is logged at debug level only and is not an error. */
static int
dpdk_eth_dev_init(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info info;
    struct ether_addr eth_addr;
    int diag;
    int n_rxq, n_txq;
    uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
                                     DEV_RX_OFFLOAD_TCP_CKSUM |
                                     DEV_RX_OFFLOAD_IPV4_CKSUM;

    rte_eth_dev_info_get(dev->port_id, &info);

    /* A driver name containing "vf" is treated as a virtual function, for
     * which CRC stripping is turned on. */
    if (strstr(info.driver_name, "vf") != NULL) {
        VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
        dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
    } else {
        dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
    }

    /* Rx checksum offload is enabled only if the device supports all of
     * UDP, TCP and IPv4 checksum offloading. */
    if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
            rx_chksm_offload_capa) {
        VLOG_WARN("Rx checksum offload is not supported on port "
                  DPDK_PORT_ID_FMT, dev->port_id);
        dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
    } else {
        dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
    }

    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

    diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
    if (diag) {
        VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
                 "configure error: %s",
                 dev->up.name, n_rxq, n_txq,
                 dev->lsc_interrupt_mode ? "true" : "false",
                 rte_strerror(-diag));
        return -diag;
    }

    diag = rte_eth_dev_start(dev->port_id);
    if (diag) {
        VLOG_ERR("Interface %s start error: %s", dev->up.name,
                 rte_strerror(-diag));
        return -diag;
    }
    dev->started = true;

    rte_eth_promiscuous_enable(dev->port_id);
    rte_eth_allmulticast_enable(dev->port_id);

    /* Zero the address first so that something well defined is logged and
     * stored even if the query below leaves it untouched. */
    memset(&eth_addr, 0x0, sizeof(eth_addr));
    rte_eth_macaddr_get(dev->port_id, &eth_addr);
    VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
                 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));

    memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
    rte_eth_link_get_nowait(dev->port_id, &dev->link);

    /* Buffer size is the mempool's per-mbuf data room minus the headroom. */
    mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
    dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

    /* Get the Flow control configuration for DPDK-ETH */
    diag = rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
    if (diag) {
        VLOG_DBG("cannot get flow control parameters on port "DPDK_PORT_ID_FMT
                 ", err=%d", dev->port_id, diag);
    }

    return 0;
}
/* Converts the generic 'netdev' into the 'struct netdev_dpdk' that embeds it
 * as its 'up' member. */
static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = CONTAINER_OF(netdev, struct netdev_dpdk, up);

    return dev;
}
/* Allocates a 'struct netdev_dpdk' with dpdk_rte_mzalloc() and returns a
 * pointer to its embedded 'struct netdev', or NULL if allocation fails. */
static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *dev = dpdk_rte_mzalloc(sizeof *dev);

    return dev ? &dev->up : NULL;
}
static struct dpdk_tx_queue *
netdev_dpdk_alloc_txq(unsigned int n_txqs)
{
struct dpdk_tx_queue *txqs;
unsigned i;
txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
if (txqs) {
for (i = 0; i < n_txqs; i++) {
/* Initialize map for vhost devices. */
txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
rte_spinlock_init(&txqs[i].tx_lock);
}
}
return txqs;
}
/* Initialization shared by every DPDK netdev type.
 *
 * Sets up the locks of the device behind 'netdev', records 'port_no', 'type'
 * and 'socket_id', fills in the default requested configuration (MTU, queue
 * counts, queue sizes), adds the device to 'dpdk_list' and requests a
 * reconfiguration of 'netdev'.
 *
 * Always returns 0. */
static int
common_construct(struct netdev *netdev, dpdk_port_t port_no,
                 enum dpdk_dev_type type, int socket_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_init(&dev->mutex);

    rte_spinlock_init(&dev->stats_lock);

    /* If 'socket_id' is negative, it means that the kernel failed
     * to obtain the pci numa info.  In that situation, always
     * use 'SOCKET0'. */
    dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
    dev->requested_socket_id = dev->socket_id;
    dev->port_id = port_no;
    dev->type = type;
    dev->requested_mtu = ETHER_MTU;
    /* NOTE(review): this is derived from 'dev->mtu', not from
     * 'dev->requested_mtu', and 'dev->mtu' is not assigned anywhere in this
     * function -- confirm it holds the intended value at this point. */
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    dev->requested_lsc_interrupt_mode = 0;
    ovsrcu_index_init(&dev->vid, -1);
    dev->vhost_reconfigured = false;
    dev->attached = false;

    ovsrcu_init(&dev->qos_conf, NULL);

    ovsrcu_init(&dev->ingress_policer, NULL);
    dev->policer_rate = 0;
    dev->policer_burst = 0;

    /* No queues are configured yet; the requested values below are what
     * the requested reconfiguration is given to work with. */
    netdev->n_rxq = 0;
    netdev->n_txq = 0;
    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_n_txq = NR_QUEUE;
    dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
    dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;

    /* Zero the flow control configuration. */
    memset(&dev->fc_conf, 0, sizeof dev->fc_conf);

    /* Initialize the hardware offload flags to 0. */
    dev->hw_ol_features = 0;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;

    ovs_list_push_back(&dpdk_list, &dev->list_node);

    netdev_request_reconfigure(netdev);

    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;

    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;

    return 0;
}
/* Extracts the port number from 'dev_name' into '*port_no'.
 *
 * 'dev_name' must be 'prefix' followed by a positive decimal number
 * (no leading + or - signs are allowed).
 *
 * Returns 0 on success, ENODEV if 'dev_name' does not begin with 'prefix' or
 * if str_to_uint() rejects what follows the prefix. */
static int
dpdk_dev_parse_name(const char dev_name[], const char prefix[],
                    unsigned int *port_no)
{
    size_t prefix_len = strlen(prefix);

    if (strncmp(dev_name, prefix, prefix_len) != 0) {
        return ENODEV;
    }

    return str_to_uint(dev_name + prefix_len, 10, port_no) ? 0 : ENODEV;
}
/* Construction steps shared by vhost-user and vhost-user-client ports:
 * allocates OVS_VHOST_MAX_QUEUE_NUM tx queues and then runs
 * common_construct() with an invalid port id, type DPDK_DEV_VHOST and the
 * socket id of the master lcore.
 *
 * Returns ENOMEM if the tx queues cannot be allocated, otherwise the result
 * of common_construct(). */
static int
vhost_common_construct(struct netdev *netdev)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int master_socket = rte_lcore_to_socket_id(rte_get_master_lcore());

    dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
    if (dev->tx_q == NULL) {
        return ENOMEM;
    }

    return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                            DPDK_DEV_VHOST, master_socket);
}
static int
netdev_dpdk_vhost_construct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const char *name = netdev->name;
int err;
/* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
* the file system. '/' or '\' would traverse directories, so they're not
* acceptable in 'name'. */
if (strchr(name, '/') || strchr(name, '\\')) {
VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
"A valid name must not include '/' or '\\'",
name);
return EINVAL;
}
ovs_mutex_lock(&dpdk_mutex);
/* Take the name of the vhost-user port and append it to the location where
* the socket is to be created, then register the socket.
*/
snprintf(dev->vhost_id, sizeof dev->vhost_id, "%s/%s",
dpdk_get_vhost_sock_dir(), name);
dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
if (err) {
VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
dev->vhost_id);
goto out;
} else {
fatal_signal_add_file_to_unlink(dev->vhost_id);
VLOG_INFO("Socket %s created for vhost-user port %s\n",
dev->vhost_id, name);
}
err = rte_vhost_driver_callback_register(dev->vhost_id,
&virtio_net_device_ops);
if (err) {
VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
"port: %s\n", name);
goto out;
}
err = rte_vhost_driver_disable_features(dev->vhost_id,
1ULL << VIRTIO_NET_F_HOST_TSO4
| 1ULL << VIRTIO_NET_F_HOST_TSO6
| 1ULL << VIRTIO_NET_F_CSUM);
if (err) {
VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
"port: %s\n", name);
goto out;
}
err = rte_vhost_driver_start(dev->vhost_id);
if (err) {
VLOG_ERR("rte_vhost_driver_start failed for vhost user "
"port: %s\n", name);
goto out;
}
err = vhost_common_construct(netdev);
if (err) {
VLOG_ERR("vhost_common_construct failed for vhost user "
"port: %s\n", name);
}
out:
ovs_mutex_unlock(&dpdk_mutex);
VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
"please migrate to dpdkvhostuserclient ports.");
return err;
}
/* Constructs a vhost-user-client port by running the common vhost
 * construction under 'dpdk_mutex'.
 *
 * Returns 0 on success, otherwise the error from vhost_common_construct(),
 * which is also logged. */
static int
netdev_dpdk_vhost_client_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = vhost_common_construct(netdev);
    if (err) {
        /* The first literal must end in a space: adjacent string literals
         * are concatenated as-is. */
        VLOG_ERR("vhost_common_construct failed for vhost user client "
                 "port: %s\n", netdev->name);
    }
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}
static int
netdev_dpdk_construct(struct netdev *netdev)
{
int err;
ovs_mutex_lock(&dpdk_mutex);
err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
DPDK_DEV_ETH, SOCKET0);
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
/* Teardown shared by every DPDK netdev type: frees the tx queue array,
 * releases the mempool reference, removes 'dev' from the list it is linked
 * into through 'list_node', frees the ingress policer and destroys
 * 'dev->mutex'.
 *
 * 'dev' itself is not freed here. */
static void
common_destruct(struct netdev_dpdk *dev)
    OVS_REQUIRES(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    rte_free(dev->tx_q);
    dpdk_mp_put(dev->dpdk_mp);

    ovs_list_remove(&dev->list_node);
    free(ovsrcu_get_protected(struct ingress_policer *,
                              &dev->ingress_policer));
    ovs_mutex_destroy(&dev->mutex);
}
static void
netdev_dpdk_destruct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation untill we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
char devname[RTE_ETH_NAME_MAX_LEN];
ovs_mutex_lock(&dpdk_mutex);
rte_eth_dev_stop(dev->port_id);
dev->started = false;
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation untill we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
if (dev->attached) {
rte_eth_dev_close(dev->port_id);
if (rte_eth_dev_detach(dev->port_id, devname) < 0) {
VLOG_ERR("Device '%s' can not be detached", dev->devargs);
} else {
dpdk: Fix device cleanup. Commit 5dcde09c80a8 was introduced to make detaching more automatic without using an additional command beyond ovs-vsctl del-port <br> <port>. Sometimes, since commit 5dcde09c80a8, dpdk devices are not detached when del-port is issued; command example: sudo ovs-vsctl del-port br0 dpdk1 This can happen when vswitchd is (re)started with an existing database and devices are already bound to dpdk. A minimal recipe to reproduce the issue is: 1/ Starting with darrell@prmh-nsx-perf-server125:~$ sudo ovs-vsctl show 1c50d8ee-b17f-4fac-a595-03b0da8c8275 Bridge "br0" Port "br0" Interface "br0" type: internal Port "dpdk1" Interface "dpdk1" type: dpdk options: {dpdk-devargs="0000:04:00.1"} Port "dpdk0" Interface "dpdk0" type: dpdk options: {dpdk-devargs="0000:04:00.0"} darrell@prmh-nsx-perf-server125:~$ /usr/src/dpdk-16.11/tools/dpdk-devbind.py --status Network devices using DPDK-compatible driver ============================================ 0000:04:00.0 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 0000:04:00.1 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 2/ restart vswitchd 3/ run sudo ovs-vsctl del-port br0 dpdk1 and find the interface is NOT detached; there is no info log ‘Device '0000:04:00.1' detached’. A more verbose discussion is here: https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/333462.html along with another possible solution. Since we are nearing the end of a release, a safe approach is needed, at this time. One approach is to revert 5dcde09c80a8. This patch does not do that but reinstates the command ovs-appctl netdev-dpdk/detach to handle cases when del-port will not work. 
To detach the device, run the reinstated command ovs-appctl netdev-dpdk/detach 0000:04:00.1 Observe console output ‘Device '0000:04:00.1' has been detached’ Fixes: 5dcde09c80a8 ("netdev-dpdk: Fix device leak on port deletion.") CC: Ilya Maximets <i.maximets@samsung.com> Acked-by: Aaron Conole <aconole@redhat.com> Acked-by: Fischetti, Antonio <antonio.fischetti@intel.com> Signed-off-by: Darrell Ball <dlu998@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-01 17:04:29 -07:00
VLOG_INFO("Device '%s' has been detached", devname);
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation untill we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
}
}
netdev_dpdk_clear_xstats(dev);
free(dev->devargs);
common_destruct(dev);
ovs_mutex_unlock(&dpdk_mutex);
}
/* Unregisters the vhost driver of the socket 'vhost_id' and returns the
 * result of rte_vhost_driver_unregister().
 *
 * rte_vhost_driver_unregister() can call back destroy_device(), which will
 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'.  To avoid a
 * deadlock, the caller must hold neither of these mutexes.  'dev' is only
 * there for the lock annotations. */
static int
dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
                             char *vhost_id)
    OVS_EXCLUDED(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    int error = rte_vhost_driver_unregister(vhost_id);

    return error;
}
/* Destroys a vhost port.
 *
 * The common teardown runs under 'dpdk_mutex'.  The vhost driver is then
 * unregistered with no mutex held, working on a private copy of the socket
 * path taken before the teardown.  Nothing is unregistered when the socket
 * path is empty.  For a port that is not in vhost-user-client mode, a
 * successful unregister also removes the socket from the list of files to
 * unlink. */
static void
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    char *socket_path;

    ovs_mutex_lock(&dpdk_mutex);

    /* Guest becomes an orphan if still attached. */
    if (netdev_dpdk_get_vid(dev) >= 0
        && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        VLOG_ERR("Removing port '%s' while vhost device still attached.",
                 netdev->name);
        VLOG_ERR("To restore connectivity after re-adding of port, VM on "
                 "socket '%s' must be restarted.", dev->vhost_id);
    }

    socket_path = xstrdup(dev->vhost_id);

    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);

    if (socket_path[0]) {
        if (dpdk_vhost_driver_unregister(dev, socket_path)) {
            VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
                     netdev->name, socket_path);
        } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
            /* OVS server mode - remove this socket from list for deletion */
            fatal_signal_remove_file_to_unlink(socket_path);
        }
    }

    free(socket_path);
}
/* Releases, with rte_free(), the 'struct netdev_dpdk' that contains
 * 'netdev'. */
static void
netdev_dpdk_dealloc(struct netdev *netdev)
{
    rte_free(netdev_dpdk_cast(netdev));
}
/* Drops the cached extended statistics names and ids of 'dev' and resets the
 * matching pointers and sizes, so that the statistics have to be configured
 * again before the next use.  This is needed because the port id may have
 * changed since they were cached.
 *
 * Safe to call when nothing is cached: free(NULL) is a no-op, and resetting
 * the sizes unconditionally keeps each pointer/size pair consistent. */
static void
netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
{
    free(dev->rte_xstats_names);
    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;

    free(dev->rte_xstats_ids);
    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;
}
/* Returns the cached name of the extended statistic 'id', or "UNKNOWN" when
 * 'id' is not below the number of cached names. */
static const char*
netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
{
    return id < dev->rte_xstats_names_size
           ? dev->rte_xstats_names[id].name
           : "UNKNOWN";
}
/* Makes sure that the extended statistics names and the ids of the counters
 * of interest are cached in 'dev'.
 *
 * If both are already cached, nothing is done.  Otherwise all counter names
 * are fetched from the port, the current counters are read, and the id of
 * each counter whose name ends with "_errors" or "_dropped" or contains
 * "_management_" is stored in 'dev->rte_xstats_ids'.
 *
 * Returns true if the cache is usable.  Returns false if anything goes wrong
 * or if no counters could be read; in that case the cache is cleared, so
 * that the names buffer is NULL and later xstats queries are skipped. */
static bool
netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_eth_xstat *values = NULL;
    bool configured = false;
    int n_names;
    int n_kept;

    if (dev->rte_xstats_names != NULL && dev->rte_xstats_ids != NULL) {
        /* Already configured */
        return true;
    }

    /* A NULL buffer asks only for the number of names. */
    dev->rte_xstats_names_size =
        rte_eth_xstats_get_names(dev->port_id, NULL, 0);
    if (dev->rte_xstats_names_size < 0) {
        VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        dev->rte_xstats_names_size = 0;
        goto out;
    }

    /* Reserve memory for xstats names and values */
    dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
                                    sizeof *dev->rte_xstats_names);
    if (!dev->rte_xstats_names) {
        goto out;
    }

    /* Retrieve xstats names */
    n_names = rte_eth_xstats_get_names(dev->port_id,
                                       dev->rte_xstats_names,
                                       dev->rte_xstats_names_size);
    if (n_names < 0) {
        VLOG_WARN("Cannot get XSTATS names for port: "
                  DPDK_PORT_ID_FMT, dev->port_id);
        goto out;
    }
    if (n_names != dev->rte_xstats_names_size) {
        VLOG_WARN("XSTATS size doesn't match for port: "
                  DPDK_PORT_ID_FMT, dev->port_id);
        goto out;
    }

    dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
                                  sizeof(uint64_t));

    /* We have to calculate number of counters.  The buffer is filled with
     * 0xff bytes first, so an entry that the query does not write keeps an
     * all-ones id. */
    values = xmalloc(n_names * sizeof *values);
    memset(values, 0xff, sizeof *values * n_names);

    /* Retrieve xstats values */
    if (rte_eth_xstats_get(dev->port_id, values, n_names) <= 0) {
        VLOG_WARN("Can't get XSTATS IDs for port: "
                  DPDK_PORT_ID_FMT, dev->port_id);
        goto out;
    }

    dev->rte_xstats_ids_size = 0;
    n_kept = 0;
    for (int i = 0; i < n_names; i++) {
        uint64_t id = values[i].id;
        const char *name = netdev_dpdk_get_xstat_name(dev, id);

        /* We need to filter out everything except
         * dropped, error and management counters */
        if (string_ends_with(name, "_errors") ||
            strstr(name, "_management_") ||
            string_ends_with(name, "_dropped")) {
            dev->rte_xstats_ids[n_kept++] = id;
        }
    }
    dev->rte_xstats_ids_size = n_kept;
    configured = true;

out:
    free(values);
    if (!configured) {
        netdev_dpdk_clear_xstats(dev);
    }
    return configured;
}
static int
netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
smap_add_format(args, "mtu", "%d", dev->mtu);
if (dev->type == DPDK_DEV_ETH) {
smap_add_format(args, "requested_rxq_descriptors", "%d",
dev->requested_rxq_size);
smap_add_format(args, "configured_rxq_descriptors", "%d",
dev->rxq_size);
smap_add_format(args, "requested_txq_descriptors", "%d",
dev->requested_txq_size);
smap_add_format(args, "configured_txq_descriptors", "%d",
dev->txq_size);
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports. Add Rx checksum offloading feature support on DPDK physical ports. By default, the Rx checksum offloading is enabled if NIC supports. However, the checksum offloading can be turned OFF either while adding a new DPDK physical port to OVS or at runtime. The rx checksum offloading can be turned off by setting the parameter to 'false'. For eg: To disable the rx checksum offloading when adding a port, 'ovs-vsctl add-port br0 dpdk0 -- \ set Interface dpdk0 type=dpdk options:rx-checksum-offload=false' OR (to disable at run time after port is being added to OVS) 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false' Similarly to turn ON rx checksum offloading at run time, 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true' The Tx checksum offloading support is not implemented due to the following reasons. 1) Checksum offloading and vectorization are mutually exclusive in DPDK poll mode driver. Vector packet processing is turned OFF when checksum offloading is enabled which causes significant performance drop at Tx side. 2) Normally, OVS generates checksum for tunnel packets in software at the 'tunnel push' operation, where the tunnel headers are created. However enabling Tx checksum offloading involves, *) Mark every packets for tx checksum offloading at 'tunnel_push' and recirculate. *) At the time of xmit, validate the same flag and instruct the NIC to do the checksum calculation. In case NIC doesnt support Tx checksum offloading, the checksum calculation has to be done in software before sending out the packets. No significant performance improvement noticed with Tx checksum offloading due to the e overhead of additional validations + non vector packet processing. In some test scenarios, it introduces performance drop too. 
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling decapsulation even though the SSE vector Rx function is disabled in DPDK poll mode driver. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Acked-by: Jesse Gross <jesse@kernel.org> Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
smap_add(args, "rx_csum_offload", "true");
} else {
smap_add(args, "rx_csum_offload", "false");
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports. Add Rx checksum offloading feature support on DPDK physical ports. By default, the Rx checksum offloading is enabled if NIC supports. However, the checksum offloading can be turned OFF either while adding a new DPDK physical port to OVS or at runtime. The rx checksum offloading can be turned off by setting the parameter to 'false'. For eg: To disable the rx checksum offloading when adding a port, 'ovs-vsctl add-port br0 dpdk0 -- \ set Interface dpdk0 type=dpdk options:rx-checksum-offload=false' OR (to disable at run time after port is being added to OVS) 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false' Similarly to turn ON rx checksum offloading at run time, 'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true' The Tx checksum offloading support is not implemented due to the following reasons. 1) Checksum offloading and vectorization are mutually exclusive in DPDK poll mode driver. Vector packet processing is turned OFF when checksum offloading is enabled which causes significant performance drop at Tx side. 2) Normally, OVS generates checksum for tunnel packets in software at the 'tunnel push' operation, where the tunnel headers are created. However enabling Tx checksum offloading involves, *) Mark every packets for tx checksum offloading at 'tunnel_push' and recirculate. *) At the time of xmit, validate the same flag and instruct the NIC to do the checksum calculation. In case NIC doesnt support Tx checksum offloading, the checksum calculation has to be done in software before sending out the packets. No significant performance improvement noticed with Tx checksum offloading due to the e overhead of additional validations + non vector packet processing. In some test scenarios, it introduces performance drop too. 
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling decapsulation even though the SSE vector Rx function is disabled in DPDK poll mode driver. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Acked-by: Jesse Gross <jesse@kernel.org> Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
}
smap_add(args, "lsc_interrupt_mode",
dev->lsc_interrupt_mode ? "true" : "false");
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
/* Walks 'dpdk_list' looking for a device whose 'port_id' member equals
 * 'port_id'.
 *
 * Returns the first matching device, or NULL when no device on the list
 * uses that port id.  The caller must hold 'dpdk_mutex'. */
static struct netdev_dpdk *
netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *match = NULL;
    struct netdev_dpdk *iter;

    LIST_FOR_EACH (iter, list_node, &dpdk_list) {
        if (iter->port_id == port_id) {
            match = iter;
            break;
        }
    }

    return match;
}
static dpdk_port_t
netdev_dpdk_get_port_by_mac(const char *mac_str)
{
dpdk_port_t port_id;
struct eth_addr mac, port_mac;
if (!eth_addr_from_string(mac_str, &mac)) {
VLOG_ERR("invalid mac: %s", mac_str);
return DPDK_ETH_PORT_ID_INVALID;
}
RTE_ETH_FOREACH_DEV (port_id) {
struct ether_addr ea;
rte_eth_macaddr_get(port_id, &ea);
memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
if (eth_addr_equals(mac, port_mac)) {
return port_id;
}
}
return DPDK_ETH_PORT_ID_INVALID;
}
/*
* Normally, a PCI id is enough for identifying a specific DPDK port.
* However, for some NICs having multiple ports sharing the same PCI
* id, using PCI id won't work then.
*
* To fix that, here one more method is introduced: "class=eth,mac=$MAC".
*
* Note that the compatibility is fully kept: user can still use the
* PCI id for adding ports (when it's enough for them).
*/
static dpdk_port_t
netdev-dpdk: Fix device leak on port deletion. Currently, once created device in dpdk will exist forever even after del-port operation untill we manually call 'ovs-appctl netdev-dpdk/detach <name>', where <name> is not the port's name but the name of dpdk eth device or pci address. Few issues with current implementation: 1. Different API for usual (system) and DPDK devices. (We have to call 'ovs-appctl netdev-dpdk/detach' each time after 'del-port' to actually free the device) This is a big issue mostly for virtual DPDK devices. 2. Follows from 1: For DPDK devices 'del-port' leads just to 'rte_eth_dev_stop' and subsequent 'add-port' will just start the already existing device. Such behaviour will not reset the device to initial state as it could be expected. For example: virtual pcap pmd will continue reading input file instead of reading it from the beginning. 3. Follows from 2: After execution of the following commands 'port1' will be configured with the 'old-options' while 'ovs-vsctl show' will show us 'new-options' in dpdk-devargs field: ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<old-options> ovs-vsctl del-port port1 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \ options:dpdk-devargs=<eth_pmd_name1>,<new-options> 4. Follows from 1: Not detached device consumes 'port_id'. Since we have very limited number of 'port_id's (32 in common case) this may lead to quick exhausting of id pool and inability to add any other port. To avoid above issues we need to detach all the attached devices on port destruction. appctl 'netdev-dpdk/detach' removed because not needed anymore. We need to use internal 'attached' variable to track ports on which rte_eth_dev_attach() was called and returned successfully to avoid closing and detaching devices that do not support hotplug or by any other reason attached using the 'dpdk-extra' cmdline options. 
CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming") Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
2017-05-19 16:37:31 +03:00
netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
const char *devargs, char **errp)
{
char *name;
dpdk_port_t new_port_id = DPDK_ETH_PORT_ID_INVALID;
if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
} else {
name = xmemdup0(devargs, strcspn(devargs, ","));
if (rte_eth_dev_get_port_by_name(name, &new_port_id)
|| !rte_eth_dev_is_valid_port(new_port_id)) {
/* Device not found in DPDK, attempt to attach it */
if (!rte_eth_dev_attach(devargs, &new_port_id)) {
/* Attach successful */
dev->attached = true;
VLOG_INFO("Device '%s' attached to DPDK", devargs);
} else {
/* Attach unsuccessful */
new_port_id = DPDK_ETH_PORT_ID_INVALID;
}
}
free(name);
}
if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
}
return new_port_id;
}
/* Reads the "n_rxq" option from 'args' (default NR_QUEUE, clamped to at
 * least 1) and, if it differs from 'dev->requested_n_rxq', stores it and
 * requests a reconfiguration of the netdev. */
static void
dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
    OVS_REQUIRES(dev->mutex)
{
    int wanted_n_rxq = smap_get_int(args, "n_rxq", NR_QUEUE);

    if (wanted_n_rxq < 1) {
        wanted_n_rxq = 1;
    }

    if (dev->requested_n_rxq == wanted_n_rxq) {
        return;
    }

    dev->requested_n_rxq = wanted_n_rxq;
    netdev_request_reconfigure(&dev->up);
}
/* Reads the descriptor-ring size option named 'flag' from 'args'.
 *
 * A value is accepted only if it is positive, no larger than
 * NIC_PORT_MAX_Q_SIZE and a power of two; anything else (including a missing
 * option) falls back to 'default_size'.  When the resulting size differs
 * from '*new_size', '*new_size' is updated and a reconfiguration of 'netdev'
 * is requested. */
static void
dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
                        const char *flag, int default_size, int *new_size)
{
    int requested = smap_get_int(args, flag, default_size);
    bool usable = requested > 0
                  && requested <= NIC_PORT_MAX_Q_SIZE
                  && is_pow2(requested);
    int effective = usable ? requested : default_size;

    if (effective == *new_size) {
        return;
    }

    *new_size = effective;
    netdev_request_reconfigure(netdev);
}
static int
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
char **errp)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
enum rte_eth_fc_mode fc_mode;
static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
{RTE_FC_NONE, RTE_FC_TX_PAUSE},
{RTE_FC_RX_PAUSE, RTE_FC_FULL }
};
const char *new_devargs;
int err = 0;
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
dpdk_set_rxq_config(dev, args);
dpdk_process_queue_size(netdev, args, "n_rxq_desc",
NIC_PORT_DEFAULT_RXQ_SIZE,
&dev->requested_rxq_size);
dpdk_process_queue_size(netdev, args, "n_txq_desc",
NIC_PORT_DEFAULT_TXQ_SIZE,
&dev->requested_txq_size);
new_devargs = smap_get(args, "dpdk-devargs");
if (dev->devargs && strcmp(new_devargs, dev->devargs)) {
/* The user requested a new device. If we return error, the caller
* will delete this netdev and try to recreate it. */
err = EAGAIN;
goto out;
}
/* dpdk-devargs is required for device configuration */
if (new_devargs && new_devargs[0]) {
/* Don't process dpdk-devargs if value is unchanged and port id
* is valid */
if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
&& rte_eth_dev_is_valid_port(dev->port_id))) {
dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
new_devargs,
errp);
if (!rte_eth_dev_is_valid_port(new_port_id)) {
err = EINVAL;
} else if (new_port_id == dev->port_id) {
/* Already configured, do not reconfigure again */
err = 0;
} else {
struct netdev_dpdk *dup_dev;
dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
if (dup_dev) {
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
"which is already in use by '%s'",
netdev_get_name(netdev), new_devargs,
netdev_get_name(&dup_dev->up));
err = EADDRINUSE;
} else {
int sid = rte_eth_dev_socket_id(new_port_id);
dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
dev->devargs = xstrdup(new_devargs);
dev->port_id = new_port_id;
netdev_request_reconfigure(&dev->up);
netdev_dpdk_clear_xstats(dev);
err = 0;
}
}
}
} else {
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
"The old 'dpdk<port_id>' names are not supported",
netdev_get_name(netdev));
err = EINVAL;
}
if (err) {
goto out;
}
lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
netdev_request_reconfigure(netdev);
}
rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
dev->fc_conf.mode = fc_mode;
dev->fc_conf.autoneg = autoneg;
dpdk_eth_flow_ctrl_setup(dev);
}
out:
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
static int
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
char **errp OVS_UNUSED)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
dpdk_set_rxq_config(dev, args);
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
netdev: Add 'errp' to set_config(). Since 55e075e65ef9("netdev-dpdk: Arbitrary 'dpdk' port naming"), set_config() is used to identify a DPDK device, so it's better to report its detailed error message to the user. Tunnel devices and patch ports rely a lot on set_config() as well. This commit adds a param to set_config() that can be used to return an error message and makes use of that in netdev-dpdk and netdev-vport. Before this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': dpdk0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: could not set configuration (Invalid argument). See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". After this patch: $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk ovs-vsctl: Error detected while setting up 'dpdk0': 'dpdk0' is missing 'options:dpdk-devargs'. The old 'dpdk<port_id>' names are not supported. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 p+ -- set Interface p+ type=patch ovs-vsctl: Error detected while setting up 'p+': p+: patch type requires valid 'peer' argument. See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". $ ovs-vsctl add-port br0 gnv0 -- set Interface gnv0 type=geneve ovs-vsctl: Error detected while setting up 'gnv0': gnv0: geneve type requires valid 'remote_ip' argument. 
See ovs-vswitchd log for details. ovs-vsctl: The default log directory is "/var/log/openvswitch/". CC: Ciara Loftus <ciara.loftus@intel.com> CC: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Ciara Loftus <ciara.loftus@intel.com>
2016-12-20 17:58:14 -08:00
const struct smap *args,
char **errp OVS_UNUSED)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const char *path;
ovs_mutex_lock(&dev->mutex);
if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
path = smap_get(args, "vhost-server-path");
if (path && strcmp(path, dev->vhost_id)) {
strcpy(dev->vhost_id, path);
/* check zero copy configuration */
if (smap_get_bool(args, "dq-zero-copy", false)) {
dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
} else {
dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
}
netdev_request_reconfigure(netdev);
}
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_get_numa_id(const struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
return dev->socket_id;
}
/* Records 'n_txq' as the requested number of tx queues for the dpdk
 * interface and asks for a reconfiguration if that is a change from the
 * currently requested value.  Always returns 0. */
static int
netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    if (dev->requested_n_txq != n_txq) {
        dev->requested_n_txq = n_txq;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Allocates a 'struct netdev_rxq_dpdk' with dpdk_rte_mzalloc() and returns a
 * pointer to its embedded 'up' member, or NULL if the allocation failed. */
static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
    struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);

    return rx ? &rx->up : NULL;
}
/* Converts 'rxq', which must be the 'up' member of a
 * 'struct netdev_rxq_dpdk', back into a pointer to that containing
 * structure. */
static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);

    return rx;
}
/* Initializes the DPDK rx queue 'rxq' by copying the owning device's
 * 'port_id' into it while holding the device mutex.  Always returns 0. */
static int
netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
{
    struct netdev_dpdk *owner = netdev_dpdk_cast(rxq->netdev);
    struct netdev_rxq_dpdk *dpdk_rxq = netdev_rxq_dpdk_cast(rxq);

    ovs_mutex_lock(&owner->mutex);
    dpdk_rxq->port_id = owner->port_id;
    ovs_mutex_unlock(&owner->mutex);

    return 0;
}
/* No-op: the body is empty and 'rxq' is unused. */
static void
netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
{
}
/* Releases the 'struct netdev_rxq_dpdk' that contains 'rxq' with
 * rte_free(). */
static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
{
    rte_free(netdev_rxq_dpdk_cast(rxq));
}
/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
 * 'pkts', even in case of failure.
 *
 * rte_eth_tx_burst() is called repeatedly on the not-yet-sent tail of 'pkts'
 * until either all 'cnt' packets were accepted or a call accepts none.
 *
 * Returns the number of packets that weren't transmitted. */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                         struct rte_mbuf **pkts, int cnt)
{
    uint32_t nb_tx = 0;

    while (nb_tx != cnt) {
        uint32_t ret;

        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
        if (!ret) {
            /* The queue accepted nothing: give up instead of spinning. */
            break;
        }

        nb_tx += ret;
    }

    if (OVS_UNLIKELY(nb_tx != cnt)) {
        /* Free buffers, which we couldn't transmit, one at a time (each
         * packet could come from a different mempool) */
        int i;

        for (i = nb_tx; i < cnt; i++) {
            rte_pktmbuf_free(pkts[i]);
        }
    }

    return cnt - nb_tx;
}
/* Runs 'pkt' through the color-blind srTCM 'meter' at timestamp 'time'.
 *
 * The length handed to the meter is the packet length minus
 * sizeof(struct ether_hdr).  Returns true if the meter colors the packet
 * green, false otherwise. */
static inline bool
netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
                               struct rte_mbuf *pkt, uint64_t time)
{
    uint32_t metered_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);
    bool is_green = rte_meter_srtcm_color_blind_check(meter, time, metered_len)
                    == e_RTE_METER_GREEN;

    return is_green;
}
/* Applies 'meter' to the 'pkt_cnt' packets in 'pkts'.
 *
 * Packets that pass netdev_dpdk_policer_pkt_handle() are compacted, in their
 * original order, to the front of 'pkts'.  Packets that do not pass are
 * freed if 'should_steal' is true and otherwise left alone.  A single
 * rte_rdtsc() timestamp is used for the whole batch.
 *
 * Returns the number of packets kept. */
static int
netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
                        struct rte_mbuf **pkts, int pkt_cnt,
                        bool should_steal)
{
    uint64_t now = rte_rdtsc();
    int kept = 0;
    int idx;

    for (idx = 0; idx < pkt_cnt; idx++) {
        struct rte_mbuf *mbuf = pkts[idx];

        if (!netdev_dpdk_policer_pkt_handle(meter, mbuf, now)) {
            if (should_steal) {
                rte_pktmbuf_free(mbuf);
            }
            continue;
        }

        /* Slide the packet down only once a gap has opened up. */
        if (kept != idx) {
            pkts[kept] = mbuf;
        }
        kept++;
    }

    return kept;
}
/* Runs the ingress policer's meter over 'pkts' while holding
 * 'policer->policer_lock'.  Arguments and return value are those of
 * netdev_dpdk_policer_run(): the number of packets that remain in 'pkts'. */
static int
ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
                    int pkt_cnt, bool should_steal)
{
    int kept;

    rte_spinlock_lock(&policer->policer_lock);
    kept = netdev_dpdk_policer_run(&policer->in_policer, pkts,
                                   pkt_cnt, should_steal);
    rte_spinlock_unlock(&policer->policer_lock);

    return kept;
}
/* Returns true if 'dev' has a non-negative vid (as reported by
 * netdev_dpdk_get_vid()) and its 'vhost_reconfigured' flag is set. */
static bool
is_vhost_running(struct netdev_dpdk *dev)
{
    if (netdev_dpdk_get_vid(dev) < 0) {
        return false;
    }

    return dev->vhost_reconfigured;
}
/* Increments the one rx size-bucket counter in 'stats' that 'packet_size'
 * falls into.  The buckets are: up to 64, 65-127, 128-255, 256-511,
 * 512-1023, 1024-1522 and 1523 or more bytes. */
static inline void
netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
                                          unsigned int packet_size)
{
    if (packet_size <= 64) {
        stats->rx_1_to_64_packets++;
    } else if (packet_size < 128) {
        stats->rx_65_to_127_packets++;
    } else if (packet_size < 256) {
        stats->rx_128_to_255_packets++;
    } else if (packet_size < 512) {
        stats->rx_256_to_511_packets++;
    } else if (packet_size < 1024) {
        stats->rx_512_to_1023_packets++;
    } else if (packet_size < 1523) {
        stats->rx_1024_to_1522_packets++;
    } else {
        stats->rx_1523_to_max_packets++;
    }
}
/* Accounts a received batch in 'stats'.
 *
 * 'rx_packets' grows by 'count' and 'rx_dropped' by 'dropped'.  Then, for
 * each of the 'count' entries in 'packets':
 *
 *   - A packet shorter than ETH_HEADER_LEN bumps 'rx_errors' and
 *     'rx_length_errors' and is not counted any further.
 *
 *   - Any other packet updates the size-bucket counters, 'multicast' (when
 *     the destination address is multicast) and 'rx_bytes'. */
static inline void
netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
                                     struct dp_packet **packets, int count,
                                     int dropped)
{
    int idx;

    stats->rx_packets += count;
    stats->rx_dropped += dropped;

    for (idx = 0; idx < count; idx++) {
        struct dp_packet *pkt = packets[idx];
        unsigned int len = dp_packet_size(pkt);
        struct eth_header *eth;

        if (OVS_UNLIKELY(len < ETH_HEADER_LEN)) {
            /* This only protects the following multicast counting from
             * too short packets, but it does not stop the packet from
             * further processing. */
            stats->rx_errors++;
            stats->rx_length_errors++;
            continue;
        }

        netdev_dpdk_vhost_update_rx_size_counters(stats, len);

        eth = (struct eth_header *) dp_packet_data(pkt);
        if (OVS_UNLIKELY(eth_addr_is_multicast(eth->eth_dst))) {
            stats->multicast++;
        }

        stats->rx_bytes += len;
    }
}
/*
* The receive path for the vhost port is the TX path out from guest.
*/
static int
netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
struct dp_packet_batch *batch, int *qfill)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
uint16_t nb_rx = 0;
uint16_t dropped = 0;
int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
netdev-dpdk: Fix calling vhost API with negative vid. Currently, rx and tx functions for vhost interfaces always obtain 'vid' twice. First time inside 'is_vhost_running' for checking the value and the second time in enqueue/dequeue function calls to send/receive packets. But second time we're not checking the returned value. If vhost device will be destroyed between checking and enqueue/dequeue, DPDK API will be called with '-1' instead of valid 'vid'. DPDK API does not validate the 'vid'. This leads to getting random memory value as a pointer to internal device structure inside DPDK. Access by this pointer leads to segmentation fault. For example: |00503|dpdk|INFO|VHOST_CONFIG: read message VHOST_USER_GET_VRING_BASE [New Thread 0x7fb6754910 (LWP 21246)] Program received signal SIGSEGV, Segmentation fault. rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 630 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) (gdb) bt full #0 rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 dev = 0xffffffff #1 __netdev_dpdk_vhost_send at lib/netdev-dpdk.c:1803 tx_pkts = <optimized out> cur_pkts = 0x7f340084f0 total_pkts = 32 dropped = 0 i = <optimized out> retries = 0 ... (gdb) p *((struct netdev_dpdk *) netdev) $8 = { ... , flags = (NETDEV_UP | NETDEV_PROMISC), ... , vid = {v = -1}, vhost_reconfigured = false, ... } Issue can be reproduced by stopping DPDK application (testpmd) inside guest while heavy traffic flows to this VM. Fix that by obtaining and checking the 'vid' only once. CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 0a0f39df1d5a ("netdev-dpdk: Add support for DPDK 16.07") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Billy O'Mahony <billy.o.mahony@intel.com> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-10-06 13:50:14 +03:00
int vid = netdev_dpdk_get_vid(dev);
netdev-dpdk: Fix calling vhost API with negative vid. Currently, rx and tx functions for vhost interfaces always obtain 'vid' twice. First time inside 'is_vhost_running' for checking the value and the second time in enqueue/dequeue function calls to send/receive packets. But second time we're not checking the returned value. If vhost device will be destroyed between checking and enqueue/dequeue, DPDK API will be called with '-1' instead of valid 'vid'. DPDK API does not validate the 'vid'. This leads to getting random memory value as a pointer to internal device structure inside DPDK. Access by this pointer leads to segmentation fault. For example: |00503|dpdk|INFO|VHOST_CONFIG: read message VHOST_USER_GET_VRING_BASE [New Thread 0x7fb6754910 (LWP 21246)] Program received signal SIGSEGV, Segmentation fault. rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 630 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) (gdb) bt full #0 rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 dev = 0xffffffff #1 __netdev_dpdk_vhost_send at lib/netdev-dpdk.c:1803 tx_pkts = <optimized out> cur_pkts = 0x7f340084f0 total_pkts = 32 dropped = 0 i = <optimized out> retries = 0 ... (gdb) p *((struct netdev_dpdk *) netdev) $8 = { ... , flags = (NETDEV_UP | NETDEV_PROMISC), ... , vid = {v = -1}, vhost_reconfigured = false, ... } Issue can be reproduced by stopping DPDK application (testpmd) inside guest while heavy traffic flows to this VM. Fix that by obtaining and checking the 'vid' only once. CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 0a0f39df1d5a ("netdev-dpdk: Add support for DPDK 16.07") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Billy O'Mahony <billy.o.mahony@intel.com> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-10-06 13:50:14 +03:00
if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
|| !(dev->flags & NETDEV_UP))) {
return EAGAIN;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
(struct rte_mbuf **) batch->packets,
NETDEV_MAX_BURST);
if (!nb_rx) {
return EAGAIN;
}
if (qfill) {
if (nb_rx == NETDEV_MAX_BURST) {
/* The DPDK API returns a uint32_t which often has invalid bits in
* the upper 16-bits. Need to restrict the value to uint16_t. */
*qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
} else {
*qfill = 0;
}
}
if (policer) {
dropped = nb_rx;
nb_rx = ingress_policer_run(policer,
(struct rte_mbuf **) batch->packets,
nb_rx, true);
dropped -= nb_rx;
}
rte_spinlock_lock(&dev->stats_lock);
netdev_dpdk_vhost_update_rx_counters(&dev->stats, batch->packets,
nb_rx, dropped);
rte_spinlock_unlock(&dev->stats_lock);
batch->count = nb_rx;
dp_packet_batch_init_packet_fields(batch);
return 0;
}
/* Receives a burst of up to NETDEV_MAX_BURST packets from the DPDK ethernet
 * port queue behind 'rxq' into 'batch', applying the ingress policer when one
 * is configured.
 *
 * If 'qfill' is nonnull, it receives the result of rte_eth_rx_queue_count()
 * when the batch ends up holding NETDEV_MAX_BURST packets, and 0 otherwise.
 *
 * Returns 0 on success, or EAGAIN if the device is not up or no packets were
 * received. */
static int
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
                     int *qfill)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    struct rte_mbuf **mbufs = (struct rte_mbuf **) batch->packets;
    int n_dropped = 0;
    int n_pkts;

    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    n_pkts = rte_eth_rx_burst(rx->port_id, rxq->queue_id, mbufs,
                              NETDEV_MAX_BURST);
    if (!n_pkts) {
        return EAGAIN;
    }

    if (policer) {
        int n_kept = ingress_policer_run(policer, mbufs, n_pkts, true);

        n_dropped = n_pkts - n_kept;
        n_pkts = n_kept;
    }

    /* Only take the stats lock when there is something to record. */
    if (OVS_UNLIKELY(n_dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.rx_dropped += n_dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }

    batch->count = n_pkts;
    dp_packet_batch_init_packet_fields(batch);

    if (qfill) {
        *qfill = (n_pkts == NETDEV_MAX_BURST)
                 ? rte_eth_rx_queue_count(rx->port_id, rxq->queue_id)
                 : 0;
    }

    return 0;
}
/* Runs the QoS configuration of 'dev', if any, over the 'cnt' packets in
 * 'pkts', holding the configuration's lock for the duration of the call.
 *
 * Returns the value returned by the configured qos_run() callback, or 'cnt'
 * unchanged when 'dev' has no QoS configuration.  'should_steal' is passed
 * through to the callback. */
static inline int
netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                    int cnt, bool should_steal)
{
    struct qos_conf *conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
    int remaining;

    if (!conf) {
        return cnt;
    }

    rte_spinlock_lock(&conf->lock);
    remaining = conf->ops->qos_run(conf, pkts, cnt, should_steal);
    rte_spinlock_unlock(&conf->lock);

    return remaining;
}
/* Removes from 'pkts' every packet whose length exceeds the max_packet_len
 * of 'dev'.
 *
 * Oversized packets are logged (rate-limited) and freed.  The remaining
 * packets are compacted to the front of 'pkts' in their original order.
 * Returns the number of packets kept. */
static int
netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                              int pkt_cnt)
{
    int kept = 0;

    for (int src = 0; src < pkt_cnt; src++) {
        struct rte_mbuf *mbuf = pkts[src];

        if (OVS_LIKELY(mbuf->pkt_len <= dev->max_packet_len)) {
            /* Slide the packet down only once a gap has opened up. */
            if (OVS_UNLIKELY(src != kept)) {
                pkts[kept] = mbuf;
            }
            kept++;
        } else {
            VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
                         dev->up.name, mbuf->pkt_len, dev->max_packet_len);
            rte_pktmbuf_free(mbuf);
        }
    }

    return kept;
}
/* Accounts a transmit attempt in 'stats'.
 *
 * Of the 'attempted' packets, 'dropped' were not sent.  The packets that were
 * sent are taken to be the first 'attempted - dropped' entries of 'packets',
 * and their sizes are added to tx_bytes. */
static inline void
netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
                                     struct dp_packet **packets,
                                     int attempted,
                                     int dropped)
{
    const int n_sent = attempted - dropped;
    int idx = 0;

    stats->tx_dropped += dropped;
    stats->tx_packets += n_sent;

    while (idx < n_sent) {
        stats->tx_bytes += dp_packet_size(packets[idx]);
        idx++;
    }
}
static void
__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
struct dp_packet **pkts, int cnt)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
unsigned int total_pkts = cnt;
unsigned int dropped = 0;
int i, retries = 0;
netdev-dpdk: Fix calling vhost API with negative vid. Currently, rx and tx functions for vhost interfaces always obtain 'vid' twice. First time inside 'is_vhost_running' for checking the value and the second time in enqueue/dequeue function calls to send/receive packets. But second time we're not checking the returned value. If vhost device will be destroyed between checking and enqueue/dequeue, DPDK API will be called with '-1' instead of valid 'vid'. DPDK API does not validate the 'vid'. This leads to getting random memory value as a pointer to internal device structure inside DPDK. Access by this pointer leads to segmentation fault. For example: |00503|dpdk|INFO|VHOST_CONFIG: read message VHOST_USER_GET_VRING_BASE [New Thread 0x7fb6754910 (LWP 21246)] Program received signal SIGSEGV, Segmentation fault. rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 630 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) (gdb) bt full #0 rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 dev = 0xffffffff #1 __netdev_dpdk_vhost_send at lib/netdev-dpdk.c:1803 tx_pkts = <optimized out> cur_pkts = 0x7f340084f0 total_pkts = 32 dropped = 0 i = <optimized out> retries = 0 ... (gdb) p *((struct netdev_dpdk *) netdev) $8 = { ... , flags = (NETDEV_UP | NETDEV_PROMISC), ... , vid = {v = -1}, vhost_reconfigured = false, ... } Issue can be reproduced by stopping DPDK application (testpmd) inside guest while heavy traffic flows to this VM. Fix that by obtaining and checking the 'vid' only once. CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 0a0f39df1d5a ("netdev-dpdk: Add support for DPDK 16.07") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Billy O'Mahony <billy.o.mahony@intel.com> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-10-06 13:50:14 +03:00
int vid = netdev_dpdk_get_vid(dev);
qid = dev->tx_q[qid % netdev->n_txq].map;
netdev-dpdk: Fix calling vhost API with negative vid. Currently, rx and tx functions for vhost interfaces always obtain 'vid' twice. First time inside 'is_vhost_running' for checking the value and the second time in enqueue/dequeue function calls to send/receive packets. But second time we're not checking the returned value. If vhost device will be destroyed between checking and enqueue/dequeue, DPDK API will be called with '-1' instead of valid 'vid'. DPDK API does not validate the 'vid'. This leads to getting random memory value as a pointer to internal device structure inside DPDK. Access by this pointer leads to segmentation fault. For example: |00503|dpdk|INFO|VHOST_CONFIG: read message VHOST_USER_GET_VRING_BASE [New Thread 0x7fb6754910 (LWP 21246)] Program received signal SIGSEGV, Segmentation fault. rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 630 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) (gdb) bt full #0 rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 dev = 0xffffffff #1 __netdev_dpdk_vhost_send at lib/netdev-dpdk.c:1803 tx_pkts = <optimized out> cur_pkts = 0x7f340084f0 total_pkts = 32 dropped = 0 i = <optimized out> retries = 0 ... (gdb) p *((struct netdev_dpdk *) netdev) $8 = { ... , flags = (NETDEV_UP | NETDEV_PROMISC), ... , vid = {v = -1}, vhost_reconfigured = false, ... } Issue can be reproduced by stopping DPDK application (testpmd) inside guest while heavy traffic flows to this VM. Fix that by obtaining and checking the 'vid' only once. CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 0a0f39df1d5a ("netdev-dpdk: Add support for DPDK 16.07") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Billy O'Mahony <billy.o.mahony@intel.com> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-10-06 13:50:14 +03:00
if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
|| !(dev->flags & NETDEV_UP))) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped+= cnt;
rte_spinlock_unlock(&dev->stats_lock);
goto out;
}
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
/* Check has QoS has been configured for the netdev */
cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
dropped = total_pkts - cnt;
do {
int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
unsigned int tx_pkts;
netdev-dpdk: Fix calling vhost API with negative vid. Currently, rx and tx functions for vhost interfaces always obtain 'vid' twice. First time inside 'is_vhost_running' for checking the value and the second time in enqueue/dequeue function calls to send/receive packets. But second time we're not checking the returned value. If vhost device will be destroyed between checking and enqueue/dequeue, DPDK API will be called with '-1' instead of valid 'vid'. DPDK API does not validate the 'vid'. This leads to getting random memory value as a pointer to internal device structure inside DPDK. Access by this pointer leads to segmentation fault. For example: |00503|dpdk|INFO|VHOST_CONFIG: read message VHOST_USER_GET_VRING_BASE [New Thread 0x7fb6754910 (LWP 21246)] Program received signal SIGSEGV, Segmentation fault. rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 630 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) (gdb) bt full #0 rte_vhost_enqueue_burst at lib/librte_vhost/virtio_net.c:630 dev = 0xffffffff #1 __netdev_dpdk_vhost_send at lib/netdev-dpdk.c:1803 tx_pkts = <optimized out> cur_pkts = 0x7f340084f0 total_pkts = 32 dropped = 0 i = <optimized out> retries = 0 ... (gdb) p *((struct netdev_dpdk *) netdev) $8 = { ... , flags = (NETDEV_UP | NETDEV_PROMISC), ... , vid = {v = -1}, vhost_reconfigured = false, ... } Issue can be reproduced by stopping DPDK application (testpmd) inside guest while heavy traffic flows to this VM. Fix that by obtaining and checking the 'vid' only once. CC: Ciara Loftus <ciara.loftus@intel.com> Fixes: 0a0f39df1d5a ("netdev-dpdk: Add support for DPDK 16.07") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Billy O'Mahony <billy.o.mahony@intel.com> Acked-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-10-06 13:50:14 +03:00
tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
if (OVS_LIKELY(tx_pkts)) {
/* Packets have been sent.*/
cnt -= tx_pkts;
/* Prepare for possible retry.*/
cur_pkts = &cur_pkts[tx_pkts];
} else {
/* No packets sent - do not retry.*/
break;
}
} while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM));
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
rte_spinlock_lock(&dev->stats_lock);
netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
cnt + dropped);
rte_spinlock_unlock(&dev->stats_lock);
out:
for (i = 0; i < total_pkts - dropped; i++) {
dp_packet_delete(pkts[i]);
}
}
/* Tx function. Transmit packets indefinitely */
static void
dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
netdev-dpdk: Fix race condition with DPDK mempools in non pmd threads DPDK mempools rely on rte_lcore_id() to implement a thread-local cache. Our non pmd threads had rte_lcore_id() == 0. This allowed concurrent access to the "thread-local" cache, causing crashes. This commit resolves the issue with the following changes: - Every non pmd thread has the same lcore_id (0, for management reasons), which is not shared with any pmd thread (lcore_id for pmd threads now start from 1) - DPDK mbufs must be allocated/freed in pmd threads. When there is the need to use mempools in non pmd threads, like in dpdk_do_tx_copy(), a mutex must be held. - The previous change does not allow us anymore to pass DPDK mbufs to handler threads: therefore this commit partially revert 143859ec63d45e. Now packets are copied for upcall processing. We can remove the extra memcpy by processing upcalls in the pmd thread itself. With the introduction of the extra locking, the packet throughput will be lower in the following cases: - When using internal (tap) devices with DPDK devices on the same datapath. Anyway, to support internal devices efficiently, we needed DPDK KNI devices, which will be proper pmd devices and will not need this locking. - When packets are processed in the slow path by non pmd threads. This overhead can be avoided by handling the upcalls directly in pmd threads (a change that has already been proposed by Ryan Wilson) Also, the following two fixes have been introduced: - In dpdk_free_buf() use rte_pktmbuf_free_seg() instead of rte_mempool_put(). This allows OVS to run properly with CONFIG_RTE_LIBRTE_MBUF_DEBUG DPDK option - Do not bulk free mbufs in a transmission queue. They may belong to different mempools Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2014-07-17 14:29:36 -07:00
OVS_NO_THREAD_SAFETY_ANALYSIS
{
const size_t batch_cnt = dp_packet_batch_size(batch);
#if !defined(__CHECKER__) && !defined(_WIN32)
const size_t PKT_ARRAY_SIZE = batch_cnt;
#else
/* Sparse or MSVC doesn't like variable length array. */
enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
uint32_t cnt = batch_cnt;
uint32_t dropped = 0;
if (dev->type != DPDK_DEV_VHOST) {
/* Check if QoS has been configured for this netdev. */
cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
batch_cnt, false);
dropped += batch_cnt - cnt;
}
uint32_t txcnt = 0;
for (uint32_t i = 0; i < cnt; i++) {
struct dp_packet *packet = batch->packets[i];
uint32_t size = dp_packet_size(packet);
if (OVS_UNLIKELY(size > dev->max_packet_len)) {
VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
size, dev->max_packet_len);
dropped++;
continue;
}
dpdk: Support both shared and per port mempools. This commit re-introduces the concept of shared mempools as the default memory model for DPDK devices. Per port mempools are still available but must be enabled explicitly by a user. OVS previously used a shared mempool model for ports with the same MTU and socket configuration. This was replaced by a per port mempool model to address issues flagged by users such as: https://mail.openvswitch.org/pipermail/ovs-discuss/2016-September/042560.html However the per port model potentially requires an increase in memory resource requirements to support the same number of ports and configuration as the shared port model. This is considered a blocking factor for current deployments of OVS when upgrading to future OVS releases as a user may have to redimension memory for the same deployment configuration. This may not be possible for users. This commit resolves the issue by re-introducing shared mempools as the default memory behaviour in OVS DPDK but also refactors the memory configuration code to allow for per port mempools. This patch adds a new global config option, per-port-memory, that controls the enablement of per port mempools for DPDK devices. ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true This value defaults to false; to enable per port memory support, this field should be set to true when setting other global parameters on init (such as "dpdk-socket-mem", for example). Changing the value at runtime is not supported, and requires restarting the vswitch daemon. The mempool sweep functionality is also replaced with the sweep functionality from OVS 2.9 found in commits c77f692 (netdev-dpdk: Free mempool only when no in-use mbufs.) a7fb0a4 (netdev-dpdk: Add mempool reuse/free debug.) A new document to discuss the specifics of the memory models and example memory requirement calculations is also added. 
Signed-off-by: Ian Stokes <ian.stokes@intel.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Tiago Lam <tiago.lam@intel.com> Tested-by: Tiago Lam <tiago.lam@intel.com>
2018-06-27 14:58:31 +01:00
pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
if (OVS_UNLIKELY(!pkts[txcnt])) {
dropped += cnt - i;
break;
}
/* We have to do a copy for now */
memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
dp_packet_data(packet), size);
dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
txcnt++;
}
if (OVS_LIKELY(txcnt)) {
if (dev->type == DPDK_DEV_VHOST) {
__netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
txcnt);
} else {
dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
}
}
netdev-dpdk: Fix race condition with DPDK mempools in non pmd threads DPDK mempools rely on rte_lcore_id() to implement a thread-local cache. Our non pmd threads had rte_lcore_id() == 0. This allowed concurrent access to the "thread-local" cache, causing crashes. This commit resolves the issue with the following changes: - Every non pmd thread has the same lcore_id (0, for management reasons), which is not shared with any pmd thread (lcore_id for pmd threads now start from 1) - DPDK mbufs must be allocated/freed in pmd threads. When there is the need to use mempools in non pmd threads, like in dpdk_do_tx_copy(), a mutex must be held. - The previous change does not allow us anymore to pass DPDK mbufs to handler threads: therefore this commit partially revert 143859ec63d45e. Now packets are copied for upcall processing. We can remove the extra memcpy by processing upcalls in the pmd thread itself. With the introduction of the extra locking, the packet throughput will be lower in the following cases: - When using internal (tap) devices with DPDK devices on the same datapath. Anyway, to support internal devices efficiently, we needed DPDK KNI devices, which will be proper pmd devices and will not need this locking. - When packets are processed in the slow path by non pmd threads. This overhead can be avoided by handling the upcalls directly in pmd threads (a change that has already been proposed by Ryan Wilson) Also, the following two fixes have been introduced: - In dpdk_free_buf() use rte_pktmbuf_free_seg() instead of rte_mempool_put(). This allows OVS to run properly with CONFIG_RTE_LIBRTE_MBUF_DEBUG DPDK option - Do not bulk free mbufs in a transmission queue. They may belong to different mempools Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2014-07-17 14:29:36 -07:00
if (OVS_UNLIKELY(dropped)) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dropped;
rte_spinlock_unlock(&dev->stats_lock);
}
}
/* Send callback for vhost devices: transmits 'batch' on tx queue 'qid' of
 * 'netdev'.  Always returns 0.
 *
 * The source of the first packet decides the path for the whole batch:
 * DPDK-backed packets are handed straight to __netdev_dpdk_vhost_send(),
 * anything else is copied into mbufs first and the original batch is then
 * deleted.  'concurrent_txq' is unused. */
static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                       struct dp_packet_batch *batch,
                       bool concurrent_txq OVS_UNUSED)
{
    if (OVS_LIKELY(batch->packets[0]->source == DPBUF_DPDK)) {
        __netdev_dpdk_vhost_send(netdev, qid, batch->packets, batch->count);
        return 0;
    }

    dpdk_do_tx_copy(netdev, qid, batch);
    dp_packet_delete_batch(batch, true);

    return 0;
}
static inline void
netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
struct dp_packet_batch *batch,
dpif-netdev: XPS (Transmit Packet Steering) implementation. If CPU number in pmd-cpu-mask is not divisible by the number of queues and in a few more complex situations there may be unfair distribution of TX queue-ids between PMD threads. For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask such distribution is possible: <------------------------------------------------------------------------> pmd thread numa_id 0 core_id 13: port: vhost-user1 queue-id: 1 port: dpdk0 queue-id: 3 pmd thread numa_id 0 core_id 14: port: vhost-user1 queue-id: 2 pmd thread numa_id 0 core_id 16: port: dpdk0 queue-id: 0 pmd thread numa_id 0 core_id 17: port: dpdk0 queue-id: 1 pmd thread numa_id 0 core_id 12: port: vhost-user1 queue-id: 0 port: dpdk0 queue-id: 2 pmd thread numa_id 0 core_id 15: port: vhost-user1 queue-id: 3 <------------------------------------------------------------------------> As we can see above dpdk0 port polled by threads on cores: 12, 13, 16 and 17. By design of dpif-netdev, there is only one TX queue-id assigned to each pmd thread. This queue-id's are sequential similar to core-id's. And thread will send packets to queue with exact this queue-id regardless of port. In previous example: pmd thread on core 12 will send packets to tx queue 0 pmd thread on core 13 will send packets to tx queue 1 ... pmd thread on core 17 will send packets to tx queue 5 So, for dpdk0 port after truncating in netdev-dpdk: core 12 --> TX queue-id 0 % 4 == 0 core 13 --> TX queue-id 1 % 4 == 1 core 16 --> TX queue-id 4 % 4 == 0 core 17 --> TX queue-id 5 % 4 == 1 As a result only 2 of 4 queues used. To fix this issue some kind of XPS implemented in following way: * TX queue-ids are allocated dynamically. * When PMD thread first time tries to send packets to new port it allocates less used TX queue for this port. * PMD threads periodically performes revalidation of allocated TX queue-ids. 
If queue wasn't used in last XPS_TIMEOUT_MS milliseconds it will be freed while revalidation. * XPS is not working if we have enough TX queues. Reported-by: Zhihong Wang <zhihong.wang@intel.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
2016-07-27 17:44:41 +03:00
bool concurrent_txq)
{
if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
dp_packet_delete_batch(batch, true);
return;
}
dpif-netdev: XPS (Transmit Packet Steering) implementation. If CPU number in pmd-cpu-mask is not divisible by the number of queues and in a few more complex situations there may be unfair distribution of TX queue-ids between PMD threads. For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask such distribution is possible: <------------------------------------------------------------------------> pmd thread numa_id 0 core_id 13: port: vhost-user1 queue-id: 1 port: dpdk0 queue-id: 3 pmd thread numa_id 0 core_id 14: port: vhost-user1 queue-id: 2 pmd thread numa_id 0 core_id 16: port: dpdk0 queue-id: 0 pmd thread numa_id 0 core_id 17: port: dpdk0 queue-id: 1 pmd thread numa_id 0 core_id 12: port: vhost-user1 queue-id: 0 port: dpdk0 queue-id: 2 pmd thread numa_id 0 core_id 15: port: vhost-user1 queue-id: 3 <------------------------------------------------------------------------> As we can see above dpdk0 port polled by threads on cores: 12, 13, 16 and 17. By design of dpif-netdev, there is only one TX queue-id assigned to each pmd thread. This queue-id's are sequential similar to core-id's. And thread will send packets to queue with exact this queue-id regardless of port. In previous example: pmd thread on core 12 will send packets to tx queue 0 pmd thread on core 13 will send packets to tx queue 1 ... pmd thread on core 17 will send packets to tx queue 5 So, for dpdk0 port after truncating in netdev-dpdk: core 12 --> TX queue-id 0 % 4 == 0 core 13 --> TX queue-id 1 % 4 == 1 core 16 --> TX queue-id 4 % 4 == 0 core 17 --> TX queue-id 5 % 4 == 1 As a result only 2 of 4 queues used. To fix this issue some kind of XPS implemented in following way: * TX queue-ids are allocated dynamically. * When PMD thread first time tries to send packets to new port it allocates less used TX queue for this port. * PMD threads periodically performes revalidation of allocated TX queue-ids. 
If queue wasn't used in last XPS_TIMEOUT_MS milliseconds it will be freed while revalidation. * XPS is not working if we have enough TX queues. Reported-by: Zhihong Wang <zhihong.wang@intel.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
2016-07-27 17:44:41 +03:00
if (OVS_UNLIKELY(concurrent_txq)) {
qid = qid % dev->up.n_txq;
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
}
if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
struct netdev *netdev = &dev->up;
dpdk_do_tx_copy(netdev, qid, batch);
dp_packet_delete_batch(batch, true);
} else {
int tx_cnt, dropped;
int batch_cnt = dp_packet_batch_size(batch);
struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
tx_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
tx_cnt = netdev_dpdk_qos_run(dev, pkts, tx_cnt, true);
dropped = batch_cnt - tx_cnt;
dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, tx_cnt);
if (OVS_UNLIKELY(dropped)) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dropped;
rte_spinlock_unlock(&dev->stats_lock);
}
}
dpif-netdev: XPS (Transmit Packet Steering) implementation. If CPU number in pmd-cpu-mask is not divisible by the number of queues and in a few more complex situations there may be unfair distribution of TX queue-ids between PMD threads. For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask such distribution is possible: <------------------------------------------------------------------------> pmd thread numa_id 0 core_id 13: port: vhost-user1 queue-id: 1 port: dpdk0 queue-id: 3 pmd thread numa_id 0 core_id 14: port: vhost-user1 queue-id: 2 pmd thread numa_id 0 core_id 16: port: dpdk0 queue-id: 0 pmd thread numa_id 0 core_id 17: port: dpdk0 queue-id: 1 pmd thread numa_id 0 core_id 12: port: vhost-user1 queue-id: 0 port: dpdk0 queue-id: 2 pmd thread numa_id 0 core_id 15: port: vhost-user1 queue-id: 3 <------------------------------------------------------------------------> As we can see above dpdk0 port polled by threads on cores: 12, 13, 16 and 17. By design of dpif-netdev, there is only one TX queue-id assigned to each pmd thread. This queue-id's are sequential similar to core-id's. And thread will send packets to queue with exact this queue-id regardless of port. In previous example: pmd thread on core 12 will send packets to tx queue 0 pmd thread on core 13 will send packets to tx queue 1 ... pmd thread on core 17 will send packets to tx queue 5 So, for dpdk0 port after truncating in netdev-dpdk: core 12 --> TX queue-id 0 % 4 == 0 core 13 --> TX queue-id 1 % 4 == 1 core 16 --> TX queue-id 4 % 4 == 0 core 17 --> TX queue-id 5 % 4 == 1 As a result only 2 of 4 queues used. To fix this issue some kind of XPS implemented in following way: * TX queue-ids are allocated dynamically. * When PMD thread first time tries to send packets to new port it allocates less used TX queue for this port. * PMD threads periodically performes revalidation of allocated TX queue-ids. 
If queue wasn't used in last XPS_TIMEOUT_MS milliseconds it will be freed while revalidation. * XPS is not working if we have enough TX queues. Reported-by: Zhihong Wang <zhihong.wang@intel.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
2016-07-27 17:44:41 +03:00
if (OVS_UNLIKELY(concurrent_txq)) {
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
}
}
/* Sends 'batch' on tx queue 'qid' of the DPDK ethernet device behind
 * 'netdev' by delegating to netdev_dpdk_send__(); 'concurrent_txq' is
 * forwarded unchanged.  Always returns 0. */
static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
                     struct dp_packet_batch *batch, bool concurrent_txq)
{
    netdev_dpdk_send__(netdev_dpdk_cast(netdev), qid, batch, concurrent_txq);

    return 0;
}
/* Stores 'mac' as the hardware address of 'netdev'.  The device's change
 * sequence is bumped only when the stored address actually differs from
 * 'mac'.  Always returns 0. */
static int
netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    bool unchanged;

    ovs_mutex_lock(&dev->mutex);
    unchanged = eth_addr_equals(dev->hwaddr, mac);
    if (!unchanged) {
        dev->hwaddr = mac;
        netdev_change_seq_changed(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
static int
netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
*mac = dev->hwaddr;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
/* Stores the MTU currently recorded for 'netdev' in '*mtup'.  The value is
 * read while holding the device mutex.  Always returns 0. */
static int
netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int current_mtu;

    ovs_mutex_lock(&dev->mutex);
    current_mtu = dev->mtu;
    ovs_mutex_unlock(&dev->mutex);

    *mtup = current_mtu;
    return 0;
}
static int
netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
netdev-dpdk: Fix requested MTU size validation. This commit replaces MTU_TO_FRAME_LEN(mtu) with MTU_TO_MAX_FRAME_LEN(mtu) in netdev_dpdk_set_mtu(), in order to determine if the total length of the L2 frame with an MTU of ’mtu’ exceeds NETDEV_DPDK_MAX_PKT_LEN. When setting an MTU we first check if the requested total frame length (which includes associated L2 overhead) will exceed the maximum frame length supported in netdev_dpdk_set_mtu(). The frame length is calculated by MTU_TO_FRAME_LEN as MTU + ETHER_HEADER + ETHER_CRC. The MTU for the device will be set at a later stage in dpdk_eth_dev_init() using rte_eth_dev_set_mtu(mtu). However when using rte_eth_dev_set_mtu(mtu) the calculation used to check that the frame does not exceed the max frame length for that device varies between DPDK device drivers. For example ixgbe driver calculates the frame length for a given MTU as mtu + ETHER_HDR_LEN + ETHER_CRC_LEN i40e driver calculates it as mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + I40E_VLAN_TAG_SIZE * 2 em driver calculates it as mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + VLAN_TAG_SIZE Currently it is possible to set an MTU for a netdev_dpdk device that exceeds the upper limit MTU for that devices DPDK driver. This leads to a segfault. This is because the frame length comparison as is, does not take into account the addition of the vlan tag overhead expected in the drivers. The netdev_dpdk_set_mtu() call will incorrectly succeed but the subsequent dpdk_eth_dev_init() will fail before the queues have been created for the DPDK device. This coupled with assumptions regarding reconfiguration requirements for the netdev will lead to a segfault when the rxq is polled for this device. A simple way to avoid this is by using MTU_TO_MAX_FRAME_LEN(mtu) when validating a requested MTU in netdev_dpdk_set_mtu(). 
MTU_TO_MAX_FRAME_LEN(mtu) is equivalent to the following: mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + (2 * VLAN_HEADER_LEN) By using MTU_TO_MAX_FRAME_LEN at the netdev_dpdk_set_mtu() stage, OvS now takes into account the maximum L2 overhead that a DPDK driver could allow for in its frame size calculation. This allows OVS to flag an error rather than the DPDK driver if the frame length exceeds the max DPDK frame length. OVS can fail gracefully at this point and use the default MTU of 1500 to continue to configure the port. Note: this fix is a work around, a better approach would be if DPDK devices could report the maximum MTU value that can be requested on a per device basis. This capability however is not currently available. A downside of this patch is that the MTU upper limit will be reduced by 8 bytes for DPDK devices that do not need to account for vlan tags in the frame length driver calculations e.g. ixgbe devices upper MTU limit is reduced from the OVS point of view from 9710 to 9702. CC: Mark Kavanagh <mark.b.kavanagh@intel.com> Fixes: 0072e931 ("netdev-dpdk: add support for jumbo frames") Signed-off-by: Ian Stokes <ian.stokes@intel.com> Co-authored-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> Acked-by: Flavio Leitner <fbl@sysclose.org>
2018-01-09 16:09:28 +00:00
/* XXX: Ensure that the overall frame length of the requested MTU does not
* surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
* the L2 frame length is calculated for a given MTU when
* rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
* headers, the em driver includes 1 x vlan header, the ixgbe driver does
* not include vlan headers. As such we should use
* MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
* (8 bytes) for comparison. This avoids a failure later with
* rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
* a method to retrieve the upper bound MTU for a given device.
*/
if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
|| mtu < ETHER_MIN_MTU) {
VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
return EINVAL;
}
ovs_mutex_lock(&dev->mutex);
if (dev->requested_mtu != mtu) {
dev->requested_mtu = mtu;
netdev_request_reconfigure(netdev);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
/* Forward declaration: netdev_dpdk_get_stats() below calls this function
 * before its definition appears later in the file. */
static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
/* Fills 'stats' from the software counters kept in 'dev->stats' for the
 * device behind 'netdev'.  Only the counters copied below are written; all
 * other members of 'stats' are left as the caller supplied them.  The copy
 * is made while holding both the device mutex and the stats spinlock.
 * Always returns 0. */
static int
netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
                            struct netdev_stats *stats)
{
/* Copies one same-named counter from the device into 'stats'. */
#define VHOST_COPY_STAT(FIELD) stats->FIELD = dev->stats.FIELD

    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    rte_spinlock_lock(&dev->stats_lock);

    /* Packet and byte totals. */
    VHOST_COPY_STAT(rx_packets);
    VHOST_COPY_STAT(tx_packets);
    VHOST_COPY_STAT(rx_bytes);
    VHOST_COPY_STAT(tx_bytes);

    /* Drops, errors and multicast. */
    VHOST_COPY_STAT(rx_dropped);
    VHOST_COPY_STAT(tx_dropped);
    VHOST_COPY_STAT(multicast);
    VHOST_COPY_STAT(rx_errors);
    VHOST_COPY_STAT(rx_length_errors);

    /* Received-size histogram. */
    VHOST_COPY_STAT(rx_1_to_64_packets);
    VHOST_COPY_STAT(rx_65_to_127_packets);
    VHOST_COPY_STAT(rx_128_to_255_packets);
    VHOST_COPY_STAT(rx_256_to_511_packets);
    VHOST_COPY_STAT(rx_512_to_1023_packets);
    VHOST_COPY_STAT(rx_1024_to_1522_packets);
    VHOST_COPY_STAT(rx_1523_to_max_packets);

    rte_spinlock_unlock(&dev->stats_lock);
    ovs_mutex_unlock(&dev->mutex);

    return 0;

#undef VHOST_COPY_STAT
}
/* Translates DPDK extended statistics into 'stats'.
 *
 * 'xstats' and 'names' are parallel arrays of 'size' entries: entry i of
 * 'names' gives the name of the counter whose value is in entry i of
 * 'xstats'.  Each entry whose name equals one of the XSTAT_* strings below
 * is copied into the matching member of 'stats'; entries with any other
 * name are ignored, and members with no matching entry are left untouched. */
static void
netdev_dpdk_convert_xstats(struct netdev_stats *stats,
                           const struct rte_eth_xstat *xstats,
                           const struct rte_eth_xstat_name *names,
                           const unsigned int size)
{
/* When the current entry is named ID, copies its value to 'stats->FIELD' and
 * moves on to the next entry.  Deliberately not wrapped in do/while so that
 * 'continue' applies to the enclosing for loop. */
#define XSTAT_TO_FIELD(ID, FIELD)                   \
    if (strcmp(ID, name) == 0) {                    \
        stats->FIELD = xstats[idx].value;           \
        continue;                                   \
    }

    for (unsigned int idx = 0; idx < size; idx++) {
        const char *name = names[idx].name;

        /* Receive size histogram. */
        XSTAT_TO_FIELD(XSTAT_RX_64_PACKETS, rx_1_to_64_packets)
        XSTAT_TO_FIELD(XSTAT_RX_65_TO_127_PACKETS, rx_65_to_127_packets)
        XSTAT_TO_FIELD(XSTAT_RX_128_TO_255_PACKETS, rx_128_to_255_packets)
        XSTAT_TO_FIELD(XSTAT_RX_256_TO_511_PACKETS, rx_256_to_511_packets)
        XSTAT_TO_FIELD(XSTAT_RX_512_TO_1023_PACKETS, rx_512_to_1023_packets)
        XSTAT_TO_FIELD(XSTAT_RX_1024_TO_1522_PACKETS, rx_1024_to_1522_packets)
        XSTAT_TO_FIELD(XSTAT_RX_1523_TO_MAX_PACKETS, rx_1523_to_max_packets)

        /* Transmit size histogram. */
        XSTAT_TO_FIELD(XSTAT_TX_64_PACKETS, tx_1_to_64_packets)
        XSTAT_TO_FIELD(XSTAT_TX_65_TO_127_PACKETS, tx_65_to_127_packets)
        XSTAT_TO_FIELD(XSTAT_TX_128_TO_255_PACKETS, tx_128_to_255_packets)
        XSTAT_TO_FIELD(XSTAT_TX_256_TO_511_PACKETS, tx_256_to_511_packets)
        XSTAT_TO_FIELD(XSTAT_TX_512_TO_1023_PACKETS, tx_512_to_1023_packets)
        XSTAT_TO_FIELD(XSTAT_TX_1024_TO_1522_PACKETS, tx_1024_to_1522_packets)
        XSTAT_TO_FIELD(XSTAT_TX_1523_TO_MAX_PACKETS, tx_1523_to_max_packets)

        /* Multicast and broadcast. */
        XSTAT_TO_FIELD(XSTAT_RX_MULTICAST_PACKETS, multicast)
        XSTAT_TO_FIELD(XSTAT_TX_MULTICAST_PACKETS, tx_multicast_packets)
        XSTAT_TO_FIELD(XSTAT_RX_BROADCAST_PACKETS, rx_broadcast_packets)
        XSTAT_TO_FIELD(XSTAT_TX_BROADCAST_PACKETS, tx_broadcast_packets)

        /* Receive errors. */
        XSTAT_TO_FIELD(XSTAT_RX_UNDERSIZED_ERRORS, rx_undersized_errors)
        XSTAT_TO_FIELD(XSTAT_RX_FRAGMENTED_ERRORS, rx_fragmented_errors)
        XSTAT_TO_FIELD(XSTAT_RX_JABBER_ERRORS, rx_jabber_errors)
    }

#undef XSTAT_TO_FIELD
}
/* Fills 'stats' for the DPDK ethernet device behind 'netdev'.
 *
 * Basic counters come from rte_eth_stats_get(); extended counters are
 * fetched with rte_eth_xstats_get_names()/rte_eth_xstats_get() and mapped by
 * netdev_dpdk_convert_xstats().  A failure to obtain extended statistics is
 * logged but is not an error: the basic counters are still reported.
 *
 * Returns 0 on success, or EPROTO if the basic statistics cannot be read. */
static int
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_xstat_name *xstat_names = NULL;
    struct rte_eth_xstat *xstat_values = NULL;
    struct rte_eth_stats rte_stats;
    int n_xstats, n_names, n_values;
    bool carrier;

    /* The carrier value itself is not used; the call is made for whatever
     * netdev_dpdk_get_carrier() does on the way (it runs
     * check_link_status() under the device mutex). */
    netdev_dpdk_get_carrier(netdev, &carrier);

    ovs_mutex_lock(&dev->mutex);

    if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
        VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
                 dev->port_id);
        ovs_mutex_unlock(&dev->mutex);
        return EPROTO;
    }

    /* Ask how many extended statistics the port exposes. */
    n_xstats = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
    if (n_xstats < 0) {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
    } else {
        /* Reserve memory for xstats names and values. */
        xstat_names = xcalloc(n_xstats, sizeof *xstat_names);
        xstat_values = xcalloc(n_xstats, sizeof *xstat_values);

        /* Retrieve xstats names. */
        n_names = rte_eth_xstats_get_names(dev->port_id, xstat_names,
                                           n_xstats);
        if (n_names != n_xstats) {
            VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
        } else {
            /* Retrieve xstats values.
             * NOTE(review): the buffer is pre-filled with 0xff, presumably
             * so that entries the driver leaves untouched read as all-ones
             * rather than zero -- confirm. */
            memset(xstat_values, 0xff, sizeof *xstat_values * n_xstats);
            n_values = rte_eth_xstats_get(dev->port_id, xstat_values,
                                          n_xstats);
            if (n_values > 0 && n_values <= n_xstats) {
                netdev_dpdk_convert_xstats(stats, xstat_values, xstat_names,
                                           n_xstats);
            } else {
                VLOG_WARN("Cannot get XSTATS values for port: "
                          DPDK_PORT_ID_FMT, dev->port_id);
            }
        }
    }

    free(xstat_values);
    free(xstat_names);

    /* Basic counters reported by the port. */
    stats->rx_packets = rte_stats.ipackets;
    stats->tx_packets = rte_stats.opackets;
    stats->rx_bytes = rte_stats.ibytes;
    stats->tx_bytes = rte_stats.obytes;
    stats->rx_errors = rte_stats.ierrors;
    stats->tx_errors = rte_stats.oerrors;

    /* Drop counters maintained in 'dev->stats'. */
    rte_spinlock_lock(&dev->stats_lock);
    stats->tx_dropped = dev->stats.tx_dropped;
    stats->rx_dropped = dev->stats.rx_dropped;
    rte_spinlock_unlock(&dev->stats_lock);

    /* These are the available DPDK counters for packets not received due to
     * local resource constraints in DPDK and NIC respectively. */
    stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
    stats->rx_missed_errors = rte_stats.imissed;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Reports the extended statistics selected by netdev_dpdk_configure_xstats()
 * as custom counters in 'custom_stats'.
 *
 * On success 'custom_stats->counters' is a newly allocated array of
 * 'custom_stats->size' name/value pairs.  If the values cannot be read,
 * 'custom_stats' is set to empty (NULL, 0), a warning is logged and the
 * cached xstats configuration is cleared.  If
 * netdev_dpdk_configure_xstats() reports failure, 'custom_stats' is not
 * touched.  Always returns 0. */
static int
netdev_dpdk_get_custom_stats(const struct netdev *netdev,
                             struct netdev_custom_stats *custom_stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    uint64_t *values;
    uint32_t i;
    int rte_xstats_ret;

    ovs_mutex_lock(&dev->mutex);

    if (!netdev_dpdk_configure_xstats(dev)) {
        goto out;
    }

    values = xcalloc(dev->rte_xstats_ids_size, sizeof(uint64_t));
    rte_xstats_ret = rte_eth_xstats_get_by_id(dev->port_id,
                                              dev->rte_xstats_ids, values,
                                              dev->rte_xstats_ids_size);

    if (rte_xstats_ret > 0 && rte_xstats_ret <= dev->rte_xstats_ids_size) {
        custom_stats->size = rte_xstats_ret;
        custom_stats->counters = xcalloc(rte_xstats_ret,
                                         sizeof(struct netdev_custom_counter));

        for (i = 0; i < rte_xstats_ret; i++) {
            const char *name
                = netdev_dpdk_get_xstat_name(dev, dev->rte_xstats_ids[i]);

            ovs_strlcpy(custom_stats->counters[i].name, name,
                        NETDEV_CUSTOM_STATS_NAME_SIZE);
            custom_stats->counters[i].value = values[i];
        }
    } else {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        custom_stats->counters = NULL;
        custom_stats->size = 0;
        /* Drop the cached statistics configuration so that it is rebuilt on
         * a later call. */
        netdev_dpdk_clear_xstats(dev);
    }

    free(values);

out:
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
static int
netdev_dpdk_get_features(const struct netdev *netdev,
enum netdev_features *current,
enum netdev_features *advertised,
enum netdev_features *supported,
enum netdev_features *peer)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_link link;
ovs_mutex_lock(&dev->mutex);
link = dev->link;
ovs_mutex_unlock(&dev->mutex);
if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
if (link.link_speed == ETH_SPEED_NUM_10M) {
*current = NETDEV_F_10MB_HD;
}
if (link.link_speed == ETH_SPEED_NUM_100M) {
*current = NETDEV_F_100MB_HD;
}
if (link.link_speed == ETH_SPEED_NUM_1G) {
*current = NETDEV_F_1GB_HD;
}
} else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
if (link.link_speed == ETH_SPEED_NUM_10M) {
*current = NETDEV_F_10MB_FD;
}
if (link.link_speed == ETH_SPEED_NUM_100M) {
*current = NETDEV_F_100MB_FD;
}
if (link.link_speed == ETH_SPEED_NUM_1G) {
*current = NETDEV_F_1GB_FD;
}
if (link.link_speed == ETH_SPEED_NUM_10G) {
*current = NETDEV_F_10GB_FD;
}
}
if (link.link_autoneg) {
*current |= NETDEV_F_AUTONEG;
}
*advertised = *supported = *peer = 0;
return 0;
}
/* Allocates and configures an ingress policer.
 *
 * 'rate' and 'burst' are given in kbits and are converted to bytes
 * (x 1000 / 8) before being handed to rte_meter_srtcm_config() as the
 * committed rate and committed burst size; the excess burst size is 0.
 *
 * Returns the new policer, which the caller owns, or NULL (after logging an
 * error and freeing the allocation) if the meter cannot be configured. */
static struct ingress_policer *
netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
{
    struct ingress_policer *policer = xmalloc(sizeof *policer);

    rte_spinlock_init(&policer->policer_lock);

    /* rte_meter requires bytes so convert kbits rate and burst to bytes.
     * The ULL constant keeps the multiplication in 64 bits. */
    policer->app_srtcm_params.cir = rate * 1000ULL / 8;
    policer->app_srtcm_params.cbs = burst * 1000ULL / 8;
    policer->app_srtcm_params.ebs = 0;

    if (rte_meter_srtcm_config(&policer->in_policer,
                               &policer->app_srtcm_params)) {
        VLOG_ERR("Could not create rte meter for ingress policer");
        free(policer);
        return NULL;
    }

    return policer;
}
/* Installs, replaces or removes the ingress policer of 'netdev'.
 *
 * A 'policer_rate' of 0 removes policing (and forces the burst to 0).  With
 * a nonzero rate, a 'policer_burst' of 0 is replaced by the default of
 * 8000 kbits.  Nothing is done when the effective rate and burst equal the
 * values already recorded on the device.  A policer being replaced is freed
 * through ovsrcu_postpone().  Always returns 0. */
static int
netdev_dpdk_set_policing(struct netdev *netdev, uint32_t policer_rate,
                         uint32_t policer_burst)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    /* Force to 0 if no rate specified,
     * default to 8000 kbits if burst is 0,
     * else stick with user-specified value.
     */
    if (!policer_rate) {
        policer_burst = 0;
    } else if (!policer_burst) {
        policer_burst = 8000;
    }

    ovs_mutex_lock(&dev->mutex);

    /* Identical settings are assumed to mean nothing changed since they
     * were last applied. */
    if (dev->policer_rate != policer_rate
        || dev->policer_burst != policer_burst) {
        struct ingress_policer *policer;

        /* Destroy any existing ingress policer for the device. */
        policer = ovsrcu_get_protected(struct ingress_policer *,
                                       &dev->ingress_policer);
        if (policer) {
            ovsrcu_postpone(free, policer);
        }

        policer = (policer_rate != 0
                   ? netdev_dpdk_policer_construct(policer_rate,
                                                   policer_burst)
                   : NULL);

        ovsrcu_set(&dev->ingress_policer, policer);
        dev->policer_rate = policer_rate;
        dev->policer_burst = policer_burst;
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Returns a synthetic ifindex for 'netdev', derived from a hash of its name
 * and reduced to the range 1..0xfffffe. */
static int
netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int ifindex;

    ovs_mutex_lock(&dev->mutex);
    /* Calculate hash from the netdev name.  Ensure that ifindex is a 24-bit
     * positive integer to meet RFC 2863 recommendations.
     */
    ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
    ovs_mutex_unlock(&dev->mutex);

    return ifindex;
}
/* Stores the link status of 'netdev' in '*carrier'.  check_link_status() is
 * run first, under the device mutex, and the status is then read from
 * 'dev->link'.  Always returns 0. */
static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    bool link_up;

    ovs_mutex_lock(&dev->mutex);
    check_link_status(dev);
    link_up = dev->link.link_status;
    ovs_mutex_unlock(&dev->mutex);

    *carrier = link_up;
    return 0;
}
/* Stores the carrier state of the vhost device behind 'netdev' in
 * '*carrier': true when is_vhost_running() reports that the device is
 * running, false otherwise.  Always returns 0. */
static int
netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *carrier = is_vhost_running(dev) ? true : false;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
/* Returns the link reset counter recorded for 'netdev', read while holding
 * the device mutex. */
static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    long long int n_resets;

    ovs_mutex_lock(&dev->mutex);
    n_resets = dev->link_reset_cnt;
    ovs_mutex_unlock(&dev->mutex);

    return n_resets;
}
/* MII monitoring is not implemented for DPDK devices: both arguments are
 * ignored and EOPNOTSUPP is always returned. */
static int
netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
long long int interval OVS_UNUSED)
{
return EOPNOTSUPP;
}
/* Clears the flags in 'off' and sets the flags in 'on' on 'dev', storing the
 * previous flags in '*old_flagsp'.  Only NETDEV_UP and NETDEV_PROMISC may be
 * changed; any other bit yields EINVAL before anything is modified.
 *
 * When the flags actually change:
 *   - for DPDK_DEV_ETH devices, promiscuous mode is enabled on the port if
 *     NETDEV_PROMISC is now set, and the change sequence is bumped;
 *   - for other device types, the change sequence is bumped only if the
 *     NETDEV_UP test below passes and vhost is running, and the device
 *     statistics are zeroed when the device is being brought up.
 *
 * Returns 0 on success, EINVAL for unsupported flags. */
static int
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
                           enum netdev_flags off, enum netdev_flags on,
                           enum netdev_flags *old_flagsp)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags before;

    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
        return EINVAL;
    }

    before = dev->flags;
    *old_flagsp = before;

    dev->flags |= on;
    dev->flags &= ~off;
    if (dev->flags == before) {
        /* Nothing changed, nothing to propagate. */
        return 0;
    }

    if (dev->type == DPDK_DEV_ETH) {
        if (dev->flags & NETDEV_PROMISC) {
            rte_eth_promiscuous_enable(dev->port_id);
        }
        netdev_change_seq_changed(&dev->up);
        return 0;
    }

    /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
     * running then change netdev's change_seq to trigger link state
     * update. */
    if ((NETDEV_UP & ((before ^ on) | (before ^ off)))
        && is_vhost_running(dev)) {
        netdev_change_seq_changed(&dev->up);

        /* Clear statistics if device is getting up. */
        if (NETDEV_UP & on) {
            rte_spinlock_lock(&dev->stats_lock);
            memset(&dev->stats, 0, sizeof dev->stats);
            rte_spinlock_unlock(&dev->stats_lock);
        }
    }

    return 0;
}
/* Locked wrapper around netdev_dpdk_update_flags__(): takes the device
 * mutex of 'netdev', applies 'off'/'on', stores the previous flags in
 * '*old_flagsp' and returns the helper's result. */
static int
netdev_dpdk_update_flags(struct netdev *netdev,
                         enum netdev_flags off, enum netdev_flags on,
                         enum netdev_flags *old_flagsp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int retval;

    ovs_mutex_lock(&dev->mutex);
    retval = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
    ovs_mutex_unlock(&dev->mutex);

    return retval;
}
/* Adds status information about the vhost-user device behind 'netdev' to
 * 'args'.
 *
 * "mode" (client/server) and "status" (connected/disconnected) are always
 * added.  When the device has a valid vhost id, the following keys are
 * added as well, each only if the corresponding rte_vhost query succeeds:
 * "socket", "features", "mtu", "numa", "num_of_vrings" and one
 * "vring_<n>_size" per vring.
 *
 * Always returns 0. */
static int
netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
                                  struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
    smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");

    int vid = netdev_dpdk_get_vid(dev);
    if (vid < 0) {
        /* No vhost id: nothing further can be queried. */
        smap_add_format(args, "status", "disconnected");
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    } else {
        smap_add_format(args, "status", "connected");
    }

    char socket_name[PATH_MAX];
    if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
        smap_add_format(args, "socket", "%s", socket_name);
    }

    uint64_t features;
    if (!rte_vhost_get_negotiated_features(vid, &features)) {
        smap_add_format(args, "features", "0x%016"PRIx64, features);
    }

    uint16_t mtu;
    if (!rte_vhost_get_mtu(vid, &mtu)) {
        smap_add_format(args, "mtu", "%d", mtu);
    }

    int numa = rte_vhost_get_numa_node(vid);
    if (numa >= 0) {
        smap_add_format(args, "numa", "%d", numa);
    }

    uint16_t vring_num = rte_vhost_get_vring_num(vid);
    if (vring_num) {
        smap_add_format(args, "num_of_vrings", "%d", vring_num);
    }

    for (int i = 0; i < vring_num; i++) {
        struct rte_vhost_vring vring;

        /* Skip vrings whose query fails rather than publishing an
         * uninitialized 'vring.size', consistent with the other
         * rte_vhost_*() queries above. */
        if (rte_vhost_get_vhost_vring(vid, i, &vring)) {
            continue;
        }
        smap_add_nocopy(args, xasprintf("vring_%d_size", i),
                        xasprintf("%d", vring.size));
    }

    ovs_mutex_unlock(&dev->mutex);
    return 0;
}
/* Adds status information about the DPDK ethernet device behind 'netdev' to
 * 'args': port number, NUMA id, driver name, buffer/queue/address limits
 * reported by rte_eth_dev_info_get(), the interface type and description
 * and, when the device info carries a PCI device, its vendor and device ids
 * in hexadecimal.
 *
 * Returns 0 on success, or ENODEV if the device's port id is not a valid
 * DPDK port. */
static int
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_dev_info dev_info;

    if (!rte_eth_dev_is_valid_port(dev->port_id)) {
        return ENODEV;
    }

    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_info_get(dev->port_id, &dev_info);
    ovs_mutex_unlock(&dev->mutex);

    smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
    smap_add_format(args, "numa_id", "%d",
                    rte_eth_dev_socket_id(dev->port_id));
    smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
    smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
    smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
    smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
    smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
    smap_add_format(args, "max_hash_mac_addrs", "%u",
                    dev_info.max_hash_mac_addrs);
    smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
    smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);

    /* Querying the DPDK library for iftype may be done in future, pending
     * support; cf. RFC 3635 Section 3.2.4. */
    enum { IF_TYPE_ETHERNETCSMACD = 6 };

    smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
    smap_add_format(args, "if_descr", "%s %s", rte_version(),
                    dev_info.driver_name);

    if (dev_info.pci_dev) {
        /* Both ids carry a "0x" prefix, so both must be printed in hex. */
        smap_add_format(args, "pci-vendor_id", "0x%x",
                        dev_info.pci_dev->id.vendor_id);
        smap_add_format(args, "pci-device_id", "0x%x",
                        dev_info.pci_dev->id.device_id);
    }

    return 0;
}
/* Sets (when 'admin_state' is true) or clears (when false) NETDEV_UP on
 * 'dev' through netdev_dpdk_update_flags__().  The previous flags and the
 * helper's return value are discarded. */
static void
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags ignored;

    if (!admin_state) {
        netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &ignored);
        return;
    }
    netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &ignored);
}
/* unixctl handler that changes the administrative state of DPDK devices.
 *
 * The last argument must be "up" or "down" (case-insensitive).  With an
 * interface name in argv[1] (argc > 2) only that interface is changed and it
 * must be a DPDK interface; otherwise every device on 'dpdk_list' is
 * changed.  Replies "OK" on success or an error message on bad input. */
static void
netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)
{
    const char *state = argv[argc - 1];
    struct netdev_dpdk *dev;
    bool up;

    if (!strcasecmp(state, "up")) {
        up = true;
    } else if (!strcasecmp(state, "down")) {
        up = false;
    } else {
        unixctl_command_reply_error(conn, "Invalid Admin State");
        return;
    }

    if (argc <= 2) {
        /* No interface named: apply to all DPDK devices. */
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
    } else {
        struct netdev *netdev = netdev_from_name(argv[1]);

        if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            netdev_close(netdev);
            return;
        }

        dev = netdev_dpdk_cast(netdev);
        ovs_mutex_lock(&dev->mutex);
        netdev_dpdk_set_admin_state__(dev, up);
        ovs_mutex_unlock(&dev->mutex);

        netdev_close(netdev);
    }

    unixctl_command_reply(conn, "OK");
}
dpdk: Fix device cleanup. Commit 5dcde09c80a8 was introduced to make detaching more automatic without using an additional command beyond ovs-vsctl del-port <br> <port>. Sometimes, since commit 5dcde09c80a8, dpdk devices are not detached when del-port is issued; command example: sudo ovs-vsctl del-port br0 dpdk1 This can happen when vswitchd is (re)started with an existing database and devices are already bound to dpdk. A minimal recipe to reproduce the issue is: 1/ Starting with darrell@prmh-nsx-perf-server125:~$ sudo ovs-vsctl show 1c50d8ee-b17f-4fac-a595-03b0da8c8275 Bridge "br0" Port "br0" Interface "br0" type: internal Port "dpdk1" Interface "dpdk1" type: dpdk options: {dpdk-devargs="0000:04:00.1"} Port "dpdk0" Interface "dpdk0" type: dpdk options: {dpdk-devargs="0000:04:00.0"} darrell@prmh-nsx-perf-server125:~$ /usr/src/dpdk-16.11/tools/dpdk-devbind.py --status Network devices using DPDK-compatible driver ============================================ 0000:04:00.0 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 0000:04:00.1 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 2/ restart vswitchd 3/ run sudo ovs-vsctl del-port br0 dpdk1 and find the interface is NOT detached; there is no info log ‘Device '0000:04:00.1' detached’. A more verbose discussion is here: https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/333462.html along with another possible solution. Since we are nearing the end of a release, a safe approach is needed, at this time. One approach is to revert 5dcde09c80a8. This patch does not do that but reinstates the command ovs-appctl netdev-dpdk/detach to handle cases when del-port will not work. 
To detach the device, run the reinstated command ovs-appctl netdev-dpdk/detach 0000:04:00.1 Observe console output ‘Device '0000:04:00.1' has been detached’ Fixes: 5dcde09c80a8 ("netdev-dpdk: Fix device leak on port deletion.") CC: Ilya Maximets <i.maximets@samsung.com> Acked-by: Aaron Conole <aconole@redhat.com> Acked-by: Fischetti, Antonio <antonio.fischetti@intel.com> Signed-off-by: Darrell Ball <dlu998@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-01 17:04:29 -07:00
/* unixctl handler that detaches the DPDK device named by argv[1] (used for
 * "ovs-appctl netdev-dpdk/detach <device>"), for cases where deleting the
 * port did not detach the device.
 *
 * The request is refused when the device is unknown to DPDK or is still in
 * use by an OVS interface.  Otherwise the port is closed and detached.  The
 * outcome is reported to 'conn', as an error reply unless the device was
 * detached.  The whole operation runs under 'dpdk_mutex'. */
static void
netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
                   const char *argv[], void *aux OVS_UNUSED)
{
    char devname[RTE_ETH_NAME_MAX_LEN];
    struct netdev_dpdk *dev;
    dpdk_port_t port_id;
    bool detached = false;
    char *response;

    ovs_mutex_lock(&dpdk_mutex);

    if (rte_eth_dev_get_port_by_name(argv[1], &port_id)) {
        response = xasprintf("Device '%s' not found in DPDK", argv[1]);
    } else {
        dev = netdev_dpdk_lookup_by_port_id(port_id);
        if (dev) {
            response = xasprintf("Device '%s' is being used by interface '%s'. "
                                 "Remove it before detaching",
                                 argv[1], netdev_get_name(&dev->up));
        } else {
            rte_eth_dev_close(port_id);
            if (rte_eth_dev_detach(port_id, devname) < 0) {
                response = xasprintf("Device '%s' can not be detached",
                                     argv[1]);
            } else {
                response = xasprintf("Device '%s' has been detached",
                                     argv[1]);
                detached = true;
            }
        }
    }

    ovs_mutex_unlock(&dpdk_mutex);

    if (detached) {
        unixctl_command_reply(conn, response);
    } else {
        unixctl_command_reply_error(conn, response);
    }
    free(response);
}
/*
 * unixctl handler for "netdev-dpdk/get-mempool-info [netdev]".
 *
 * With a netdev argument, dumps the mempool attached to that DPDK interface;
 * without one, dumps every mempool known to DPDK.  The dump is collected in
 * an in-memory stream and sent back as the command reply.
 *
 * Replies with an error if the named interface does not exist, is not a DPDK
 * interface, has no mempool allocated, or if the memory stream cannot be
 * opened.
 */
static void
netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
                             int argc, const char *argv[],
                             void *aux OVS_UNUSED)
{
    size_t size;
    FILE *stream;
    char *response = NULL;
    const char *error = NULL;
    struct netdev *netdev = NULL;

    if (argc == 2) {
        netdev = netdev_from_name(argv[1]);
        if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            goto out;
        }
    }

    stream = open_memstream(&response, &size);
    if (!stream) {
        response = xasprintf("Unable to open memstream: %s.",
                             ovs_strerror(errno));
        unixctl_command_reply_error(conn, response);
        goto out;
    }

    if (netdev) {
        struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        ovs_mutex_lock(&dpdk_mp_mutex);

        /* The device may not have a mempool attached; do not dereference
         * a null 'dpdk_mp'. */
        if (dev->dpdk_mp) {
            rte_mempool_dump(stream, dev->dpdk_mp->mp);
        } else {
            error = "Not allocated";
        }

        ovs_mutex_unlock(&dpdk_mp_mutex);
        ovs_mutex_unlock(&dev->mutex);
    } else {
        ovs_mutex_lock(&dpdk_mp_mutex);
        rte_mempool_list_dump(stream);
        ovs_mutex_unlock(&dpdk_mp_mutex);
    }

    /* 'response' is only valid once the memory stream has been closed. */
    fclose(stream);

    if (error) {
        unixctl_command_reply_error(conn, error);
    } else {
        unixctl_command_reply(conn, response);
    }

out:
    free(response);
    netdev_close(netdev);
}
/*
 * Disables guest notification on every vring of vhost device 'vid', so that
 * we do not receive interrupts for any of its virtqueues.
 */
static void
set_irq_status(int vid)
{
    uint32_t ring = 0;

    while (ring < rte_vhost_get_vring_num(vid)) {
        rte_vhost_enable_guest_notification(vid, ring, 0);
        ring++;
    }
}
/*
 * Recomputes the vhost-user tx queue mapping of 'dev'.
 *
 * A queue whose map entry equals its own index is considered enabled and is
 * left alone.  Every other queue is pointed, round-robin, at one of the
 * enabled queues; if no queue is enabled at all, every queue is mapped to
 * OVS_VHOST_QUEUE_DISABLED.
 *
 * Must be called after each enabling/disabling of queues and after n_txq
 * modifications.
 */
static void
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int n_queues = dev->up.n_txq;
    int *active = xcalloc(n_queues, sizeof *active);
    int n_active = 0;
    int next = 0;
    int q;

    /* Collect the queues that are mapped to themselves. */
    for (q = 0; q < n_queues; q++) {
        if (dev->tx_q[q].map != q) {
            continue;
        }
        active[n_active] = q;
        n_active++;
    }

    /* Nothing enabled: use a single "disabled" target for everything. */
    if (!n_active && n_queues) {
        active[0] = OVS_VHOST_QUEUE_DISABLED;
        n_active = 1;
    }

    /* Spread the remaining queues over the targets in round-robin order. */
    for (q = 0; q < n_queues; q++) {
        if (dev->tx_q[q].map == q) {
            continue;
        }
        dev->tx_q[q].map = active[next];
        next++;
        if (next == n_active) {
            next = 0;
        }
    }

    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
    for (q = 0; q < n_queues; q++) {
        VLOG_DBG("%2d --> %2d", q, dev->tx_q[q].map);
    }

    free(active);
}
/*
 * vhost callback: a new virtio-net device 'vid' is added to a vhost port.
 *
 * Looks up the vhost port whose vhost_id matches the interface name reported
 * for 'vid', records the vid on it and, if the queue pair count or NUMA node
 * differ from what was requested, asks for a reconfiguration.
 *
 * Returns 0 on success, -1 if no port with a matching name exists.
 */
static int
new_device(int vid)
{
    struct netdev_dpdk *dev;
    char ifname[IF_NAME_SZ];
    bool found = false;
    int numa_node = 0;

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    /* Add device to the vhost port with the same name as that passed down. */
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        uint32_t n_qpairs;

        ovs_mutex_lock(&dev->mutex);
        if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) != 0) {
            ovs_mutex_unlock(&dev->mutex);
            continue;
        }

        n_qpairs = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;

        /* Fall back to the port's current socket when the NUMA node of the
         * device cannot be determined. */
        numa_node = rte_vhost_get_numa_node(vid);
        if (numa_node == -1) {
#ifdef VHOST_NUMA
            VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
                      ifname);
#endif
            numa_node = dev->socket_id;
        }

        if (dev->requested_n_txq == n_qpairs
            && dev->requested_n_rxq == n_qpairs
            && dev->requested_socket_id == numa_node) {
            /* Reconfiguration not required. */
            dev->vhost_reconfigured = true;
        } else {
            dev->requested_socket_id = numa_node;
            dev->requested_n_rxq = n_qpairs;
            dev->requested_n_txq = n_qpairs;
            netdev_request_reconfigure(&dev->up);
        }

        ovsrcu_index_set(&dev->vid, vid);
        found = true;

        /* Disable notifications. */
        set_irq_status(vid);
        netdev_change_seq_changed(&dev->up);
        ovs_mutex_unlock(&dev->mutex);
        break;
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (found) {
        VLOG_INFO("vHost Device '%s' has been added on numa node %i",
                  ifname, numa_node);
        return 0;
    }

    VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
    return -1;
}
/* Resets the mapping of every tx queue of vhost interface 'dev' to
 * OVS_VHOST_QUEUE_MAP_UNKNOWN. */
static void
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int q = 0;

    while (q < dev->up.n_txq) {
        dev->tx_q[q].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
        q++;
    }
}
/*
 * vhost callback: virtio-net device 'vid' is removed from its vhost port.
 *
 * The port that currently holds 'vid' has its vid reset to -1 and its tx
 * queue mapping cleared.  Before returning, waits for other threads to
 * quiesce so that none of them keeps using the old vid.
 */
static void
destroy_device(int vid)
{
    struct netdev_dpdk *dev;
    char ifname[IF_NAME_SZ];
    bool found = false;

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (netdev_dpdk_get_vid(dev) != vid) {
            continue;
        }

        ovs_mutex_lock(&dev->mutex);
        dev->vhost_reconfigured = false;
        ovsrcu_index_set(&dev->vid, -1);
        netdev_dpdk_txq_map_clear(dev);
        netdev_change_seq_changed(&dev->up);
        ovs_mutex_unlock(&dev->mutex);

        found = true;
        break;
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!found) {
        VLOG_INFO("vHost Device '%s' not found", ifname);
        return;
    }

    /* Wait for other threads to quiesce after the vid has been invalidated,
     * before returning. */
    ovsrcu_synchronize();

    /* The call to ovsrcu_synchronize() ends the quiescent state, so put
     * this thread back into quiescent state before returning. */
    ovsrcu_quiesce_start();
    VLOG_INFO("vHost Device '%s' has been removed", ifname);
}
/*
 * vhost callback: the state of vring 'queue_id' of device 'vid' changed.
 *
 * Only changes of RX vrings are acted upon; notifications for TX vrings
 * return 0 immediately.  For an RX vring the corresponding tx queue of the
 * matching vhost port is marked enabled (mapped to itself) or disabled, and
 * the tx queue mapping is recomputed.
 *
 * Returns 0 on success, -1 if no vhost port matches the device's name.
 */
static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int qid = queue_id / VIRTIO_QNUM;
    char ifname[IF_NAME_SZ];

    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
        return 0;
    }

    /* The name is only needed once we know the notification is relevant. */
    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            if (enable) {
                dev->tx_q[qid].map = qid;
            } else {
                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
            }
            netdev_dpdk_remap_txqs(dev);
            exists = true;
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!exists) {
        VLOG_INFO("vHost Device '%s' not found", ifname);
        return -1;
    }

    /* Report the state with the same truth test that was used for the
     * mapping above, so that the log never contradicts the applied state. */
    VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' "
              "changed to \'%s\'", queue_id, qid, ifname,
              enable ? "enabled" : "disabled");

    return 0;
}
/*
 * Returns the DPDK virtio device ID (vid) currently associated with the
 * vhostuser or vhostuserclient netdev 'dev'.
 *
 * The result is greater than or equal to zero for a valid vid, or '-1' if
 * no valid vid is associated.  A vid of '-1' must not be used in rte_vhost_
 * API calls.
 *
 * Once obtained and validated, a vid can be used by a PMD for multiple
 * subsequent rte_vhost API calls until the PMD quiesces.  A PMD should not
 * fetch the vid again for each of a series of API calls.
 */
int
netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
{
    int vid = ovsrcu_index_get(&dev->vid);

    return vid;
}
/* Returns the RCU-protected ingress policer pointer stored in 'dev'. */
struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
{
    struct ingress_policer *policer;

    policer = ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
    return policer;
}
static int
netdev_dpdk_class_init(void)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
/* This function can be called for different classes. The initialization
* needs to be done only once */
if (ovsthread_once_start(&once)) {
ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
unixctl_command_register("netdev-dpdk/set-admin-state",
"[netdev] up|down", 1, 2,
netdev_dpdk_set_admin_state, NULL);
dpdk: Fix device cleanup. Commit 5dcde09c80a8 was introduced to make detaching more automatic without using an additional command beyond ovs-vsctl del-port <br> <port>. Sometimes, since commit 5dcde09c80a8, dpdk devices are not detached when del-port is issued; command example: sudo ovs-vsctl del-port br0 dpdk1 This can happen when vswitchd is (re)started with an existing database and devices are already bound to dpdk. A minimal recipe to reproduce the issue is: 1/ Starting with darrell@prmh-nsx-perf-server125:~$ sudo ovs-vsctl show 1c50d8ee-b17f-4fac-a595-03b0da8c8275 Bridge "br0" Port "br0" Interface "br0" type: internal Port "dpdk1" Interface "dpdk1" type: dpdk options: {dpdk-devargs="0000:04:00.1"} Port "dpdk0" Interface "dpdk0" type: dpdk options: {dpdk-devargs="0000:04:00.0"} darrell@prmh-nsx-perf-server125:~$ /usr/src/dpdk-16.11/tools/dpdk-devbind.py --status Network devices using DPDK-compatible driver ============================================ 0000:04:00.0 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 0000:04:00.1 'Ethernet Controller 10-Gigabit X540-AT2' drv=uio_pci_generic unused=ixgbe,vfio-pci 2/ restart vswitchd 3/ run sudo ovs-vsctl del-port br0 dpdk1 and find the interface is NOT detached; there is no info log ‘Device '0000:04:00.1' detached’. A more verbose discussion is here: https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/333462.html along with another possible solution. Since we are nearing the end of a release, a safe approach is needed, at this time. One approach is to revert 5dcde09c80a8. This patch does not do that but reinstates the command ovs-appctl netdev-dpdk/detach to handle cases when del-port will not work. 
To detach the device, run the reinstated command ovs-appctl netdev-dpdk/detach 0000:04:00.1 Observe console output ‘Device '0000:04:00.1' has been detached’ Fixes: 5dcde09c80a8 ("netdev-dpdk: Fix device leak on port deletion.") CC: Ilya Maximets <i.maximets@samsung.com> Acked-by: Aaron Conole <aconole@redhat.com> Acked-by: Fischetti, Antonio <antonio.fischetti@intel.com> Signed-off-by: Darrell Ball <dlu998@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-01 17:04:29 -07:00
unixctl_command_register("netdev-dpdk/detach",
"pci address of device", 1, 1,
netdev_dpdk_detach, NULL);
unixctl_command_register("netdev-dpdk/get-mempool-info",
"[netdev]", 0, 1,
netdev_dpdk_get_mempool_info, NULL);
ovsthread_once_done(&once);
}
return 0;
}
/* Client Rings */

/*
 * Creates the tx/rx ring pair "<dev_name>_tx" / "<dev_name>_rx", wraps it in
 * a ring-backed ethdev and records the pair in 'dpdk_ring_list'.
 *
 * On success returns 0 and stores the new ethdev port id in '*eth_port_id'.
 * On failure returns ENOMEM (allocation or ring creation failed) or ENODEV
 * (the ethdev could not be created); everything created so far, including
 * rings that were already set up, is released again.
 */
static int
dpdk_ring_create(const char dev_name[], unsigned int port_no,
                 dpdk_port_t *eth_port_id)
{
    struct dpdk_ring *ring_pair;
    char *ring_name;
    int port_id;
    int err;

    ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
    if (!ring_pair) {
        return ENOMEM;
    }

    /* XXX: Add support for multiqueue ring. */
    ring_name = xasprintf("%s_tx", dev_name);

    /* Create single producer tx ring, netdev does explicit locking. */
    ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                          RING_F_SP_ENQ);
    free(ring_name);
    if (ring_pair->cring_tx == NULL) {
        err = ENOMEM;
        goto err_free_pair;
    }

    ring_name = xasprintf("%s_rx", dev_name);

    /* Create single consumer rx ring, netdev does explicit locking. */
    ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                          RING_F_SC_DEQ);
    free(ring_name);
    if (ring_pair->cring_rx == NULL) {
        err = ENOMEM;
        goto err_free_tx;
    }

    port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
                                 &ring_pair->cring_tx, 1, SOCKET0);
    if (port_id < 0) {
        err = ENODEV;
        goto err_free_rx;
    }

    ring_pair->user_port_id = port_no;
    ring_pair->eth_port_id = port_id;
    *eth_port_id = port_id;

    ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);

    return 0;

    /* Unwind in reverse order of creation so that no ring is leaked. */
err_free_rx:
    rte_ring_free(ring_pair->cring_rx);
err_free_tx:
    rte_ring_free(ring_pair->cring_tx);
err_free_pair:
    rte_free(ring_pair);
    return err;
}
/*
 * Resolves the "dpdkr<N>" device 'dev_name' to an ethdev port id, stored in
 * '*eth_port_id'.  An already known ring pair with the same port number is
 * reused; otherwise a new pair is created.
 *
 * Returns 0 on success, otherwise the error from name parsing or from ring
 * creation.
 */
static int
dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_ring *pair;
    unsigned int port_no;
    int error;

    /* Names always start with "dpdkr" */
    error = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
    if (error) {
        return error;
    }

    /* Reuse the ring pair if this port number has been seen before. */
    LIST_FOR_EACH (pair, list_node, &dpdk_ring_list) {
        if (pair->user_port_id != port_no) {
            continue;
        }
        VLOG_INFO("Found dpdk ring device %s:", dev_name);
        *eth_port_id = pair->eth_port_id;
        return 0;
    }

    /* Not found: the device rings need to be created. */
    return dpdk_ring_create(dev_name, port_no, eth_port_id);
}
/*
 * Sends 'batch' on tx queue 'qid' of the 'dpdkr' device 'netdev'.  Always
 * returns 0.
 */
static int
netdev_dpdk_ring_send(struct netdev *netdev, int qid,
                      struct dp_packet_batch *batch, bool concurrent_txq)
{
    struct dp_packet *pkt;

    /* The consumer of the ring may modify an mbuf and hand it back to the
     * datapath without recalculating the RSS hash, so make sure the rss
     * hash flag is clear on everything that goes out through a ring. */
    DP_PACKET_BATCH_FOR_EACH (i, pkt, batch) {
        dp_packet_mbuf_rss_flag_reset(pkt);
    }

    netdev_dpdk_send__(netdev_dpdk_cast(netdev), qid, batch, concurrent_txq);
    return 0;
}
static int
netdev_dpdk_ring_construct(struct netdev *netdev)
{
dpdk_port_t port_no = 0;
int err = 0;
ovs_mutex_lock(&dpdk_mutex);
err = dpdk_ring_open(netdev->name, &port_no);
if (err) {
goto unlock_dpdk;
}
err = common_construct(netdev, port_no, DPDK_DEV_ETH,
rte_eth_dev_socket_id(port_no));
unlock_dpdk:
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
/* QoS Functions */

/*
 * Prepares 'conf' for use: initializes its spinlock and binds it to the QoS
 * operations 'ops'.
 */
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
    rte_spinlock_init(&conf->lock);
    conf->ops = ops;
}
/*
 * Searches the NULL-terminated 'qos_confs' table for a set of QoS operations
 * whose qos_name equals 'name'.
 *
 * Returns a pointer to the matching dpdk_qos_ops, or NULL if there is no
 * match or if 'name' is NULL.
 */
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
    const struct dpdk_qos_ops *const *opsp;

    /* A null name cannot match anything; avoid passing it to strcmp(). */
    if (!name) {
        return NULL;
    }

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;

        if (!strcmp(name, ops->qos_name)) {
            return ops;
        }
    }
    return NULL;
}
/*
 * Adds to 'types' the name of every entry of 'qos_confs' that has a
 * constructor and a non-empty name.  Always returns 0.
 */
static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                          struct sset *types)
{
    const struct dpdk_qos_ops *const *entry = qos_confs;

    while (*entry != NULL) {
        const struct dpdk_qos_ops *ops = *entry;

        entry++;
        if (!ops->qos_construct || ops->qos_name[0] == '\0') {
            continue;
        }
        sset_add(types, ops->qos_name);
    }
    return 0;
}
static int
netdev_dpdk_get_qos(const struct netdev *netdev,
const char **typep, struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf) {
*typep = qos_conf->ops->qos_name;
error = (qos_conf->ops->qos_get
? qos_conf->ops->qos_get(qos_conf, details): 0);
} else {
/* No QoS configuration set, return an empty string */
*typep = "";
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
const struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const struct dpdk_qos_ops *new_ops = NULL;
struct qos_conf *qos_conf, *new_qos_conf = NULL;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
new_ops = qos_lookup_name(type);
if (!new_ops || !new_ops->qos_construct) {
new_qos_conf = NULL;
if (type && type[0]) {
error = EOPNOTSUPP;
}
} else if (qos_conf && qos_conf->ops == new_ops
&& qos_conf->ops->qos_is_equal(qos_conf, details)) {
new_qos_conf = qos_conf;
} else {
error = new_ops->qos_construct(details, &new_qos_conf);
}
if (error) {
VLOG_ERR("Failed to set QoS type %s on port %s: %s",
type, netdev->name, rte_strerror(error));
}
if (new_qos_conf != qos_conf) {
ovsrcu_set(&dev->qos_conf, new_qos_conf);
if (qos_conf) {
ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
}
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
/* egress-policer details
 *
 * State of one "egress-policer" QoS instance.  The embedded 'qos_conf' is
 * what the generic QoS code holds; the egress_policer_* functions recover
 * the enclosing structure from it with CONTAINER_OF(). */
struct egress_policer {
    /* Generic QoS configuration header (ops pointer and lock). */
    struct qos_conf qos_conf;
    /* srTCM parameters built from the user-supplied "cir"/"cbs" details. */
    struct rte_meter_srtcm_params app_srtcm_params;
    /* Meter configured from 'app_srtcm_params' and used to police packets. */
    struct rte_meter_srtcm egress_meter;
};
/*
 * Fills 'params' from the "cir" and "cbs" keys of 'details' (0 when a key is
 * absent); ebs is always 0.  The whole structure is zeroed first.
 */
static void
egress_policer_details_to_param(const struct smap *details,
                                struct rte_meter_srtcm_params *params)
{
    memset(params, 0, sizeof *params);

    params->ebs = 0;
    params->cbs = smap_get_ullong(details, "cbs", 0);
    params->cir = smap_get_ullong(details, "cir", 0);
}
/*
 * Builds a new egress policer from 'details'.
 *
 * On success returns 0 and stores the embedded qos_conf in '*conf'.  If the
 * meter cannot be configured, frees the policer, stores NULL in '*conf' and
 * returns the negated result of rte_meter_srtcm_config().
 */
static int
egress_policer_qos_construct(const struct smap *details,
                             struct qos_conf **conf)
{
    struct egress_policer *policer = xmalloc(sizeof *policer);
    int rc;

    qos_conf_init(&policer->qos_conf, &egress_policer_ops);
    egress_policer_details_to_param(details, &policer->app_srtcm_params);

    rc = rte_meter_srtcm_config(&policer->egress_meter,
                                &policer->app_srtcm_params);
    if (rc) {
        free(policer);
        *conf = NULL;
        return -rc;
    }

    *conf = &policer->qos_conf;
    return rc;
}
/* Frees the egress policer that embeds 'conf'. */
static void
egress_policer_qos_destruct(struct qos_conf *conf)
{
    free(CONTAINER_OF(conf, struct egress_policer, qos_conf));
}
static int
egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
return 0;
}
static bool
egress_policer_qos_is_equal(const struct qos_conf *conf,
const struct smap *details)
{
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
struct rte_meter_srtcm_params params;
egress_policer_details_to_param(details, &params);
return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
}
/*
 * Runs the meter of the egress policer that embeds 'conf' over the
 * 'pkt_cnt' packets in 'pkts' and returns the result of
 * netdev_dpdk_policer_run().
 */
static int
egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
                   bool should_steal)
{
    struct egress_policer *policer;

    policer = CONTAINER_OF(conf, struct egress_policer, qos_conf);

    return netdev_dpdk_policer_run(&policer->egress_meter, pkts,
                                   pkt_cnt, should_steal);
}
/*
 * QoS operations for the "egress-policer" QoS type.
 *
 * The initializer is positional: each entry must stay in the order of the
 * members of struct dpdk_qos_ops.
 * NOTE(review): that struct is declared outside this chunk -- confirm the
 * member order there before adding or reordering entries.
 */
static const struct dpdk_qos_ops egress_policer_ops = {
    "egress-policer", /* qos_name */
    egress_policer_qos_construct,
    egress_policer_qos_destruct,
    egress_policer_qos_get,
    egress_policer_qos_is_equal,
    egress_policer_run
};
static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int err = 0;
ovs_mutex_lock(&dev->mutex);
if (netdev->n_txq == dev->requested_n_txq
&& netdev->n_rxq == dev->requested_n_rxq
&& dev->mtu == dev->requested_mtu
&& dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
&& dev->rxq_size == dev->requested_rxq_size
&& dev->txq_size == dev->requested_txq_size
&& dev->socket_id == dev->requested_socket_id
&& dev->started) {
/* Reconfiguration is unnecessary */
goto out;
}
rte_eth_dev_stop(dev->port_id);
dev->started = false;
err = netdev_dpdk_mempool_configure(dev);
if (err && err != EEXIST) {
goto out;
}
dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;
netdev->n_txq = dev->requested_n_txq;
netdev->n_rxq = dev->requested_n_rxq;
dev->rxq_size = dev->requested_rxq_size;
dev->txq_size = dev->requested_txq_size;
rte_free(dev->tx_q);
err = dpdk_eth_dev_init(dev);
dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
if (!dev->tx_q) {
err = ENOMEM;
}
netdev_change_seq_changed(netdev);
out:
ovs_mutex_unlock(&dev->mutex);
return err;
}
/*
 * Common part of vhost and vhost-client reconfiguration.
 *
 * Adopts the requested queue counts, refreshes the tx queue mapping and
 * reconfigures the mempool.  If a virtio device is attached (vid >= 0) and
 * the port was not yet marked as reconfigured, marks it so.
 *
 * Returns 0 on success (EEXIST from the mempool configuration counts as
 * success), otherwise the mempool configuration error.
 */
static int
dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int error;

    dev->up.n_txq = dev->requested_n_txq;
    dev->up.n_rxq = dev->requested_n_rxq;

    /* Enable TX queue 0 by default if it wasn't disabled. */
    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
        dev->tx_q[0].map = 0;
    }

    netdev_dpdk_remap_txqs(dev);

    error = netdev_dpdk_mempool_configure(dev);
    if (error && error != EEXIST) {
        return error;
    }
    if (!error) {
        /* A new mempool was created or re-used. */
        netdev_change_seq_changed(&dev->up);
    }

    if (netdev_dpdk_get_vid(dev) >= 0 && !dev->vhost_reconfigured) {
        dev->vhost_reconfigured = true;
        /* Carrier status may need updating. */
        netdev_change_seq_changed(&dev->up);
    }

    return 0;
}
static int
netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int err;
ovs_mutex_lock(&dev->mutex);
err = dpdk_vhost_reconfigure_helper(dev);
ovs_mutex_unlock(&dev->mutex);
return err;
}
/*
 * Reconfigures the vhost-user client netdev 'netdev'.
 *
 * If the device has not been registered in client mode yet and a socket path
 * is set, the vhost driver is registered, its callbacks installed, the
 * TSO4/TSO6/CSUM features disabled and the driver started.  Afterwards the
 * common vhost reconfiguration helper is run.
 *
 * Returns 0 on success, otherwise the first error encountered.
 */
static int
netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    uint64_t vhost_flags = 0;
    bool zc_enabled;
    int err;

    ovs_mutex_lock(&dev->mutex);

    /* Configure vHost client mode if requested and if the following criteria
     * are met:
     *  1. Device hasn't been registered yet.
     *  2. A path has been specified.
     */
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)
        && strlen(dev->vhost_id)) {
        uint64_t disabled_features = (1ULL << VIRTIO_NET_F_HOST_TSO4)
                                     | (1ULL << VIRTIO_NET_F_HOST_TSO6)
                                     | (1ULL << VIRTIO_NET_F_CSUM);

        /* Register client-mode device. */
        vhost_flags |= RTE_VHOST_USER_CLIENT;

        /* Enable IOMMU support, if explicitly requested. */
        if (dpdk_vhost_iommu_enabled()) {
            vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
        }

        /* Enable zero copy flag, if requested */
        zc_enabled = dev->vhost_driver_flags
                     & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
        if (zc_enabled) {
            vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
        }

        err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
        if (err) {
            VLOG_ERR("vhost-user device setup failure for device %s\n",
                     dev->vhost_id);
            goto unlock;
        }

        /* Configuration successful */
        dev->vhost_driver_flags |= vhost_flags;
        VLOG_INFO("vHost User device '%s' created in 'client' mode, "
                  "using client socket '%s'",
                  dev->up.name, dev->vhost_id);
        if (zc_enabled) {
            VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
        }

        err = rte_vhost_driver_callback_register(dev->vhost_id,
                                                 &virtio_net_device_ops);
        if (err) {
            VLOG_ERR("rte_vhost_driver_callback_register failed for "
                     "vhost user client port: %s\n", dev->up.name);
            goto unlock;
        }

        err = rte_vhost_driver_disable_features(dev->vhost_id,
                                                disabled_features);
        if (err) {
            VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                     "client port: %s\n", dev->up.name);
            goto unlock;
        }

        err = rte_vhost_driver_start(dev->vhost_id);
        if (err) {
            VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                     "client port: %s\n", dev->up.name);
            goto unlock;
        }
    }

    err = dpdk_vhost_reconfigure_helper(dev);

unlock:
    ovs_mutex_unlock(&dev->mutex);
    return err;
}
/* Returns the rte_flow associated with 'ufid' in the 'ufid_to_rte_flow' map,
 * or NULL if there is none. */
static struct rte_flow *
ufid_to_rte_flow_find(const ovs_u128 *ufid)
{
    struct ufid_to_rte_flow_data *entry;
    size_t hash;

    hash = hash_bytes(ufid, sizeof(*ufid), 0);
    CMAP_FOR_EACH_WITH_HASH (entry, node, hash, &ufid_to_rte_flow) {
        if (!ovs_u128_equals(*ufid, entry->ufid)) {
            continue;
        }
        return entry->rte_flow;
    }

    return NULL;
}
/*
 * Records in the 'ufid_to_rte_flow' map that 'ufid' is offloaded as
 * 'rte_flow'.  'ufid' must not already have an entry.
 */
static inline void
ufid_to_rte_flow_associate(const ovs_u128 *ufid,
                           struct rte_flow *rte_flow)
{
    struct ufid_to_rte_flow_data *entry;
    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);

    /*
     * An existing rte flow must not simply be overwritten: it has to be
     * deleted first before being re-added.  If this assertion triggers, the
     * previous rte_flow was not destroyed.
     */
    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);

    entry = xzalloc(sizeof(*entry));
    entry->rte_flow = rte_flow;
    entry->ufid = *ufid;

    cmap_insert(&ufid_to_rte_flow,
                CONST_CAST(struct cmap_node *, &entry->node), hash);
}
/*
 * Removes the entry for 'ufid' from the 'ufid_to_rte_flow' map.  Logs a
 * warning if 'ufid' has no entry.
 */
static inline void
ufid_to_rte_flow_disassociate(const ovs_u128 *ufid)
{
    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
    struct ufid_to_rte_flow_data *data;

    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
        if (ovs_u128_equals(*ufid, data->ufid)) {
            cmap_remove(&ufid_to_rte_flow,
                        CONST_CAST(struct cmap_node *, &data->node), hash);
            /* Lookups iterate the cmap without taking a lock, so a reader
             * may still hold a pointer to 'data'.  Defer the free until
             * after an RCU grace period instead of freeing immediately. */
            ovsrcu_postpone(free, data);
            return;
        }
    }

    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
              UUID_ARGS((struct uuid *)ufid));
}
/*
 * Growable array of rte_flow pattern items.
 *
 * To avoid individual xrealloc calls for each new element, 'current_max'
 * is used to keep track of the currently allocated number of elements.  It
 * starts at 8 and doubles on each xrealloc call.
 */
struct flow_patterns {
    struct rte_flow_item *items;    /* Heap-allocated item array. */
    int cnt;                        /* Number of items in use. */
    int current_max;                /* Number of items allocated. */
};
/* Growable array of rte_flow actions; filled by add_flow_action(), which
 * allocates 8 entries for the first action and doubles 'current_max'
 * whenever the array is full. */
struct flow_actions {
    struct rte_flow_action *actions;    /* Heap-allocated action array. */
    int cnt;                            /* Number of actions in use. */
    int current_max;                    /* Number of actions allocated. */
};
/*
 * Logs, at debug level, the spec and mask of rte_flow pattern 'item'.
 *
 * ETH, VLAN, IPV4, UDP, SCTP, ICMP and TCP items are understood; any other
 * item type is silently ignored.  A missing spec or mask is reported as
 * "null".  Spec port/type fields are converted to host byte order for
 * printing; mask fields are printed as stored.
 */
static void
dump_flow_pattern(struct rte_flow_item *item)
{
    if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
        const struct rte_flow_item_eth *eth_spec = item->spec;
        const struct rte_flow_item_eth *eth_mask = item->mask;

        VLOG_DBG("rte flow eth pattern:\n");
        if (eth_spec) {
            VLOG_DBG(" Spec: src="ETH_ADDR_FMT", dst="ETH_ADDR_FMT", "
                     "type=0x%04" PRIx16"\n",
                     eth_spec->src.addr_bytes[0], eth_spec->src.addr_bytes[1],
                     eth_spec->src.addr_bytes[2], eth_spec->src.addr_bytes[3],
                     eth_spec->src.addr_bytes[4], eth_spec->src.addr_bytes[5],
                     eth_spec->dst.addr_bytes[0], eth_spec->dst.addr_bytes[1],
                     eth_spec->dst.addr_bytes[2], eth_spec->dst.addr_bytes[3],
                     eth_spec->dst.addr_bytes[4], eth_spec->dst.addr_bytes[5],
                     ntohs(eth_spec->type));
        } else {
            VLOG_DBG(" Spec = null\n");
        }
        if (eth_mask) {
            VLOG_DBG(" Mask: src="ETH_ADDR_FMT", dst="ETH_ADDR_FMT", "
                     "type=0x%04"PRIx16"\n",
                     eth_mask->src.addr_bytes[0], eth_mask->src.addr_bytes[1],
                     eth_mask->src.addr_bytes[2], eth_mask->src.addr_bytes[3],
                     eth_mask->src.addr_bytes[4], eth_mask->src.addr_bytes[5],
                     eth_mask->dst.addr_bytes[0], eth_mask->dst.addr_bytes[1],
                     eth_mask->dst.addr_bytes[2], eth_mask->dst.addr_bytes[3],
                     eth_mask->dst.addr_bytes[4], eth_mask->dst.addr_bytes[5],
                     eth_mask->type);
        } else {
            VLOG_DBG(" Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) {
        const struct rte_flow_item_vlan *vlan_spec = item->spec;
        const struct rte_flow_item_vlan *vlan_mask = item->mask;

        VLOG_DBG("rte flow vlan pattern:\n");
        if (vlan_spec) {
            VLOG_DBG(" Spec: tpid=0x%"PRIx16", tci=0x%"PRIx16"\n",
                     ntohs(vlan_spec->tpid), ntohs(vlan_spec->tci));
        } else {
            VLOG_DBG(" Spec = null\n");
        }
        if (vlan_mask) {
            VLOG_DBG(" Mask: tpid=0x%"PRIx16", tci=0x%"PRIx16"\n",
                     vlan_mask->tpid, vlan_mask->tci);
        } else {
            VLOG_DBG(" Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_IPV4) {
        const struct rte_flow_item_ipv4 *ipv4_spec = item->spec;
        const struct rte_flow_item_ipv4 *ipv4_mask = item->mask;

        VLOG_DBG("rte flow ipv4 pattern:\n");
        if (ipv4_spec) {
            VLOG_DBG(" Spec: tos=0x%"PRIx8", ttl=%"PRIx8", proto=0x%"PRIx8
                     ", src="IP_FMT", dst="IP_FMT"\n",
                     ipv4_spec->hdr.type_of_service,
                     ipv4_spec->hdr.time_to_live,
                     ipv4_spec->hdr.next_proto_id,
                     IP_ARGS(ipv4_spec->hdr.src_addr),
                     IP_ARGS(ipv4_spec->hdr.dst_addr));
        } else {
            VLOG_DBG(" Spec = null\n");
        }
        if (ipv4_mask) {
            VLOG_DBG(" Mask: tos=0x%"PRIx8", ttl=%"PRIx8", proto=0x%"PRIx8
                     ", src="IP_FMT", dst="IP_FMT"\n",
                     ipv4_mask->hdr.type_of_service,
                     ipv4_mask->hdr.time_to_live,
                     ipv4_mask->hdr.next_proto_id,
                     IP_ARGS(ipv4_mask->hdr.src_addr),
                     IP_ARGS(ipv4_mask->hdr.dst_addr));
        } else {
            VLOG_DBG(" Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_UDP) {
        const struct rte_flow_item_udp *udp_spec = item->spec;
        const struct rte_flow_item_udp *udp_mask = item->mask;

        VLOG_DBG("rte flow udp pattern:\n");
        if (udp_spec) {
            VLOG_DBG(" Spec: src_port=%"PRIu16", dst_port=%"PRIu16"\n",
                     ntohs(udp_spec->hdr.src_port),
                     ntohs(udp_spec->hdr.dst_port));
        } else {
            VLOG_DBG(" Spec = null\n");
        }
        if (udp_mask) {
            VLOG_DBG(" Mask: src_port=0x%"PRIx16", dst_port=0x%"PRIx16"\n",
                     udp_mask->hdr.src_port,
                     udp_mask->hdr.dst_port);
        } else {
            VLOG_DBG(" Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_SCTP) {
        const struct rte_flow_item_sctp *sctp_spec = item->spec;
        const struct rte_flow_item_sctp *sctp_mask = item->mask;

        VLOG_DBG("rte flow sctp pattern:\n");
        if (sctp_spec) {
            VLOG_DBG(" Spec: src_port=%"PRIu16", dst_port=%"PRIu16"\n",
                     ntohs(sctp_spec->hdr.src_port),
                     ntohs(sctp_spec->hdr.dst_port));
        } else {
            VLOG_DBG(" Spec = null\n");
        }
        if (sctp_mask) {
            VLOG_DBG(" Mask: src_port=0x%"PRIx16", dst_port=0x%"PRIx16"\n",
                     sctp_mask->hdr.src_port,
                     sctp_mask->hdr.dst_port);
        } else {
            VLOG_DBG(" Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_ICMP) {
        const struct rte_flow_item_icmp *icmp_spec = item->spec;
        const struct rte_flow_item_icmp *icmp_mask = item->mask;

        VLOG_DBG("rte flow icmp pattern:\n");
        if (icmp_spec) {
            /* Type and code are single bytes: no byte-order conversion. */
            VLOG_DBG(" Spec: icmp_type=%"PRIu8", icmp_code=%"PRIu8"\n",
                     icmp_spec->hdr.icmp_type,
                     icmp_spec->hdr.icmp_code);
        } else {
            VLOG_DBG(" Spec = null\n");
        }
        if (icmp_mask) {
            /* Print the mask itself; the spec may even be null here. */
            VLOG_DBG(" Mask: icmp_type=0x%"PRIx8", icmp_code=0x%"PRIx8"\n",
                     icmp_mask->hdr.icmp_type,
                     icmp_mask->hdr.icmp_code);
        } else {
            VLOG_DBG(" Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_TCP) {
        const struct rte_flow_item_tcp *tcp_spec = item->spec;
        const struct rte_flow_item_tcp *tcp_mask = item->mask;

        VLOG_DBG("rte flow tcp pattern:\n");
        if (tcp_spec) {
            VLOG_DBG(" Spec: src_port=%"PRIu16", dst_port=%"PRIu16
                     ", data_off=0x%"PRIx8", tcp_flags=0x%"PRIx8"\n",
                     ntohs(tcp_spec->hdr.src_port),
                     ntohs(tcp_spec->hdr.dst_port),
                     tcp_spec->hdr.data_off,
                     tcp_spec->hdr.tcp_flags);
        } else {
            VLOG_DBG(" Spec = null\n");
        }
        if (tcp_mask) {
            VLOG_DBG(" Mask: src_port=%"PRIx16", dst_port=%"PRIx16
                     ", data_off=0x%"PRIx8", tcp_flags=0x%"PRIx8"\n",
                     tcp_mask->hdr.src_port,
                     tcp_mask->hdr.dst_port,
                     tcp_mask->hdr.data_off,
                     tcp_mask->hdr.tcp_flags);
        } else {
            VLOG_DBG(" Mask = null\n");
        }
    }
}
/*
 * Appends one item of the given 'type' to 'patterns'.
 *
 * The item array is created with room for 8 entries on the first call and
 * doubled whenever it is full.  'spec' and 'mask' are stored as pointers, not
 * copied, so the objects they point to have to stay valid for as long as the
 * pattern list is used.  The new item is handed to dump_flow_pattern() before
 * the item count is bumped.
 */
static void
add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type type,
                 const void *spec, const void *mask)
{
    const int idx = patterns->cnt;

    if (!idx) {
        /* First item: allocate the initial array. */
        patterns->current_max = 8;
        patterns->items = xcalloc(patterns->current_max,
                                  sizeof *patterns->items);
    } else if (idx == patterns->current_max) {
        /* Array is full: double its capacity. */
        patterns->current_max *= 2;
        patterns->items = xrealloc(patterns->items,
                                   patterns->current_max *
                                   sizeof *patterns->items);
    }

    patterns->items[idx].last = NULL;
    patterns->items[idx].mask = mask;
    patterns->items[idx].spec = spec;
    patterns->items[idx].type = type;

    dump_flow_pattern(&patterns->items[idx]);
    patterns->cnt++;
}
/*
 * Appends one action of the given 'type' to 'actions'.
 *
 * The action array is created with room for 8 entries on the first call and
 * doubled whenever it is full.  'conf' is stored as a pointer, not copied, so
 * the object it points to has to stay valid for as long as the action list is
 * used.
 */
static void
add_flow_action(struct flow_actions *actions, enum rte_flow_action_type type,
                const void *conf)
{
    const int idx = actions->cnt;

    if (!idx) {
        /* First action: allocate the initial array. */
        actions->current_max = 8;
        actions->actions = xcalloc(actions->current_max,
                                   sizeof *actions->actions);
    } else if (idx == actions->current_max) {
        /* Array is full: double its capacity. */
        actions->current_max *= 2;
        actions->actions = xrealloc(actions->actions,
                                    actions->current_max *
                                    sizeof *actions->actions);
    }

    actions->actions[idx].conf = conf;
    actions->actions[idx].type = type;
    actions->cnt++;
}
/*
 * Allocates an RSS action that spreads traffic over queues 0 .. n_rxq - 1 of
 * 'netdev' and appends it to 'actions'.
 *
 * Returns the allocated configuration.  The action list only keeps a pointer
 * to it, so the caller must free() it once the action list is no longer
 * needed.
 */
static struct rte_flow_action_rss *
add_flow_rss_action(struct flow_actions *actions,
                    struct netdev *netdev)
{
    struct rte_flow_action_rss *rss_conf;
    int queue_id;

    /* One trailing queue slot per rx queue. */
    rss_conf = xmalloc(sizeof *rss_conf + netdev->n_rxq * sizeof(uint16_t));

    rss_conf->num = netdev->n_rxq;
    /*
     * A NULL 'rss_conf' lets the driver use the default RSS configuration
     * we have set: &port_conf.rx_adv_conf.rss_conf.
     */
    rss_conf->rss_conf = NULL;

    for (queue_id = 0; queue_id < rss_conf->num; queue_id++) {
        rss_conf->queue[queue_id] = queue_id;
    }

    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss_conf);

    return rss_conf;
}
/*
 * Builds an rte_flow rule from 'match' and installs it on the port behind
 * 'netdev'.
 *
 * Pattern list: ETH, then optionally VLAN, then IPV4 (only when dl_type is
 * IPv4), then at most one of TCP / UDP / SCTP / ICMP, terminated by END.
 * Action list: MARK carrying 'info->flow_mark', RSS over all rx queues of
 * 'netdev', then END.
 *
 * All spec/mask structures are locals of this function; the pattern and
 * action lists only hold pointers to them, which is why they are declared at
 * function scope and must not be moved into narrower blocks.
 *
 * 'nl_actions' and 'actions_len' are not used.
 *
 * On success the created flow is associated with 'ufid' and 0 is returned.
 * Returns -1 if the L4 part of the match cannot be expressed or if
 * rte_flow_create() fails.
 */
static int
netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
                                 const struct match *match,
                                 struct nlattr *nl_actions OVS_UNUSED,
                                 size_t actions_len OVS_UNUSED,
                                 const ovs_u128 *ufid,
                                 struct offload_info *info)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const struct rte_flow_attr flow_attr = {
        .group = 0,
        .priority = 0,
        .ingress = 1,
        .egress = 0
    };
    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
    struct rte_flow *flow;
    struct rte_flow_error error;
    uint8_t *ipv4_next_proto_mask = NULL;
    int ret = 0;

    /* Eth */
    struct rte_flow_item_eth eth_spec;
    struct rte_flow_item_eth eth_mask;
    memset(&eth_spec, 0, sizeof(eth_spec));
    memset(&eth_mask, 0, sizeof(eth_mask));
    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
        eth_spec.type = match->flow.dl_type;

        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
                   sizeof(eth_mask.dst));
        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
                   sizeof(eth_mask.src));
        eth_mask.type = match->wc.masks.dl_type;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
                         &eth_spec, &eth_mask);
    } else {
        /*
         * If user specifies a flow (like UDP flow) without L2 patterns,
         * OVS will at least set the dl_type. Normally, it's enough to
         * create an eth pattern just with it. Unluckily, some Intel's
         * NIC (such as XL710) doesn't support that. Below is a workaround,
         * which simply matches any L2 pkts.
         */
        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
    }

    /* VLAN */
    struct rte_flow_item_vlan vlan_spec;
    struct rte_flow_item_vlan vlan_mask;
    memset(&vlan_spec, 0, sizeof(vlan_spec));
    memset(&vlan_mask, 0, sizeof(vlan_mask));
    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
        /* The CFI bit is stripped from both the value and the mask. */
        vlan_spec.tci = match->flow.vlans[0].tci & ~htons(VLAN_CFI);
        vlan_mask.tci = match->wc.masks.vlans[0].tci & ~htons(VLAN_CFI);

        /* match any protocols */
        vlan_mask.tpid = 0;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
                         &vlan_spec, &vlan_mask);
    }

    /* IP v4 */
    uint8_t proto = 0;
    struct rte_flow_item_ipv4 ipv4_spec;
    struct rte_flow_item_ipv4 ipv4_mask;
    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
    /* 'dl_type' is in network byte order, so convert the constant with
     * htons() (host-to-network) rather than ntohs(). */
    if (match->flow.dl_type == htons(ETH_TYPE_IP)) {
        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
        ipv4_spec.hdr.time_to_live = match->flow.nw_ttl;
        ipv4_spec.hdr.next_proto_id = match->flow.nw_proto;
        ipv4_spec.hdr.src_addr = match->flow.nw_src;
        ipv4_spec.hdr.dst_addr = match->flow.nw_dst;

        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
        ipv4_mask.hdr.time_to_live = match->wc.masks.nw_ttl;
        ipv4_mask.hdr.next_proto_id = match->wc.masks.nw_proto;
        ipv4_mask.hdr.src_addr = match->wc.masks.nw_src;
        ipv4_mask.hdr.dst_addr = match->wc.masks.nw_dst;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
                         &ipv4_spec, &ipv4_mask);

        /* Save proto for L4 protocol setup */
        proto = ipv4_spec.hdr.next_proto_id &
                ipv4_mask.hdr.next_proto_id;

        /* Remember proto mask address for later modification */
        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
    }

    /* L4 fields are matched on, but the protocol is not one we can
     * translate. */
    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP &&
        proto != IPPROTO_SCTP && proto != IPPROTO_TCP &&
        (match->wc.masks.tp_src ||
         match->wc.masks.tp_dst ||
         match->wc.masks.tcp_flags)) {
        VLOG_DBG("L4 Protocol (%u) not supported", proto);
        ret = -1;
        goto out;
    }

    /* Only fully wildcarded or exact-match port masks are accepted. */
    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
        ret = -1;
        goto out;
    }

    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask;
    memset(&tcp_spec, 0, sizeof(tcp_spec));
    memset(&tcp_mask, 0, sizeof(tcp_mask));
    if (proto == IPPROTO_TCP) {
        tcp_spec.hdr.src_port = match->flow.tp_src;
        tcp_spec.hdr.dst_port = match->flow.tp_dst;
        /* The 16-bit 'tcp_flags' is split: high byte into 'data_off', low
         * byte into 'tcp_flags'. */
        tcp_spec.hdr.data_off = ntohs(match->flow.tcp_flags) >> 8;
        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;

        tcp_mask.hdr.src_port = match->wc.masks.tp_src;
        tcp_mask.hdr.dst_port = match->wc.masks.tp_dst;
        tcp_mask.hdr.data_off = ntohs(match->wc.masks.tcp_flags) >> 8;
        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
                         &tcp_spec, &tcp_mask);

        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

    struct rte_flow_item_udp udp_spec;
    struct rte_flow_item_udp udp_mask;
    memset(&udp_spec, 0, sizeof(udp_spec));
    memset(&udp_mask, 0, sizeof(udp_mask));
    if (proto == IPPROTO_UDP) {
        udp_spec.hdr.src_port = match->flow.tp_src;
        udp_spec.hdr.dst_port = match->flow.tp_dst;

        udp_mask.hdr.src_port = match->wc.masks.tp_src;
        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
                         &udp_spec, &udp_mask);

        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

    struct rte_flow_item_sctp sctp_spec;
    struct rte_flow_item_sctp sctp_mask;
    memset(&sctp_spec, 0, sizeof(sctp_spec));
    memset(&sctp_mask, 0, sizeof(sctp_mask));
    if (proto == IPPROTO_SCTP) {
        sctp_spec.hdr.src_port = match->flow.tp_src;
        sctp_spec.hdr.dst_port = match->flow.tp_dst;

        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
                         &sctp_spec, &sctp_mask);

        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

    struct rte_flow_item_icmp icmp_spec;
    struct rte_flow_item_icmp icmp_mask;
    memset(&icmp_spec, 0, sizeof(icmp_spec));
    memset(&icmp_mask, 0, sizeof(icmp_mask));
    if (proto == IPPROTO_ICMP) {
        /* ICMP type and code are taken from 'tp_src' and 'tp_dst'. */
        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);

        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
                         &icmp_spec, &icmp_mask);

        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

end_proto_check:

    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);

    struct rte_flow_action_mark mark;
    mark.id = info->flow_mark;
    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);

    struct rte_flow_action_rss *rss;
    rss = add_flow_rss_action(&actions, netdev);
    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);

    /* Start from a known state so that a driver which leaves 'error'
     * untouched cannot make the log below read indeterminate memory. */
    memset(&error, 0, sizeof error);

    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
                           actions.actions, &error);

    /* The RSS configuration is only needed while the rule is created. */
    free(rss);
    if (!flow) {
        /* 'error.message' may be NULL; never hand NULL to "%s". */
        VLOG_ERR("rte flow creat error: %u : message : %s\n",
                 error.type,
                 error.message ? error.message : "(no stated reason)");
        ret = -1;
        goto out;
    }
    ufid_to_rte_flow_associate(ufid, flow);
    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
             flow, UUID_ARGS((struct uuid *)ufid));

out:
    free(patterns.items);
    free(actions.actions);
    return ret;
}
/*
 * Returns true if each of the 'n' bytes starting at 'addr' is zero, false as
 * soon as a nonzero byte is found.  An empty range (n == 0) counts as all
 * zero.
 */
static bool
is_all_zero(const void *addr, size_t n)
{
    const uint8_t *byte = addr;
    size_t remaining = n;

    while (remaining--) {
        if (*byte++) {
            return false;
        }
    }

    return true;
}
/*
* Check if any unsupported flow patterns are specified.
*/
static int
netdev_dpdk_validate_flow(const struct match *match) {
struct match match_zero_wc;
/* Create a wc-zeroed version of flow */
match_init(&match_zero_wc, &match->flow, &match->wc);
if (!is_all_zero(&match_zero_wc.flow.tunnel,
sizeof(match_zero_wc.flow.tunnel))) {
goto err;
}
if (match->wc.masks.metadata ||
match->wc.masks.skb_priority ||
match->wc.masks.pkt_mark ||
match->wc.masks.dp_hash) {
goto err;
}
/* recirc id must be zero */
if (match_zero_wc.flow.recirc_id) {
goto err;
}
if (match->wc.masks.ct_state ||
match->wc.masks.ct_nw_proto ||
match->wc.masks.ct_zone ||
match->wc.masks.ct_mark ||
match->wc.masks.ct_label.u64.hi ||
match->wc.masks.ct_label.u64.lo) {
goto err;
}
if (match->wc.masks.conj_id ||
match->wc.masks.actset_output) {
goto err;
}
/* unsupported L2 */
if (!is_all_zero(&match->wc.masks.mpls_lse,
sizeof(match_zero_wc.flow.mpls_lse))) {
goto err;
}
/* unsupported L3 */
if (match->wc.masks.ipv6_label ||
match->wc.masks.ct_nw_src ||
match->wc.masks.ct_nw_dst ||
!is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr)) ||
!is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr)) ||
!is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct in6_addr)) ||
!is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct in6_addr)) ||
!is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr)) ||
!is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
!is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr)) ||
!is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr))) {
goto err;
}
/* If fragmented, then don't HW accelerate - for now */
if (match_zero_wc.flow.nw_frag) {
goto err;
}
/* unsupported L4 */
if (match->wc.masks.igmp_group_ip4 ||
match->wc.masks.ct_tp_src ||
match->wc.masks.ct_tp_dst) {
goto err;
}
return 0;
err:
VLOG_ERR("cannot HW accelerate this flow due to unsupported protocols");
return -1;
}
/*
 * Destroys 'rte_flow' on the port of 'dev'.
 *
 * On success the ufid -> rte_flow association for 'ufid' is removed as well.
 * On failure the association is left in place and the error is logged.
 *
 * Returns the value returned by rte_flow_destroy(): 0 on success, nonzero
 * otherwise.
 */
static int
netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
                             const ovs_u128 *ufid,
                             struct rte_flow *rte_flow)
{
    struct rte_flow_error error;
    int ret;

    /* Start from a known state so that a driver which leaves 'error'
     * untouched cannot make the log below read indeterminate memory. */
    memset(&error, 0, sizeof error);

    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
    if (ret == 0) {
        ufid_to_rte_flow_disassociate(ufid);
        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT "\n",
                 rte_flow, UUID_ARGS((struct uuid *)ufid));
    } else {
        /* 'error.message' may be NULL; never hand NULL to "%s". */
        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
                 error.type,
                 error.message ? error.message : "(no stated reason)");
    }

    return ret;
}
/*
 * Installs (or replaces) the hardware flow identified by 'ufid' on 'netdev'.
 *
 * A flow that already exists for 'ufid' is treated as a modification: the
 * old rte_flow is destroyed first.  The match is then validated and, if it is
 * acceptable, translated and installed.
 *
 * Returns a negative value as soon as destroying the old flow or validating
 * the match fails; otherwise returns the result of
 * netdev_dpdk_add_rte_flow_offload().  'stats' is not used.
 */
static int
netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
                     struct nlattr *actions, size_t actions_len,
                     const ovs_u128 *ufid, struct offload_info *info,
                     struct dpif_flow_stats *stats OVS_UNUSED)
{
    struct rte_flow *stale_flow = ufid_to_rte_flow_find(ufid);
    int error = 0;

    /* Flow modification: drop the previous rule before adding the new one. */
    if (stale_flow) {
        error = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
                                             ufid, stale_flow);
    }

    if (error >= 0) {
        error = netdev_dpdk_validate_flow(match);
    }

    if (error >= 0) {
        error = netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
                                                 actions_len, ufid, info);
    }

    return error;
}
/*
 * Removes the hardware flow identified by 'ufid' from 'netdev'.
 *
 * Returns -1 if no rte_flow is associated with 'ufid'; otherwise returns the
 * result of netdev_dpdk_destroy_rte_flow().  'stats' is not used.
 */
static int
netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
                     struct dpif_flow_stats *stats OVS_UNUSED)
{
    struct rte_flow *installed = ufid_to_rte_flow_find(ufid);

    return installed
           ? netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
                                          ufid, installed)
           : -1;
}
/*
 * Flow-offload entries spliced into the initializer built by
 * NETDEV_DPDK_CLASS.  Only flow_put and flow_del are provided; every other
 * slot is NULL.
 *
 * The entries are positional, so their order matters.
 * NOTE(review): presumably it has to follow the member order of
 * struct netdev_class -- confirm against netdev-provider.h.
 */
#define DPDK_FLOW_OFFLOAD_API \
NULL, /* flow_flush */ \
NULL, /* flow_dump_create */ \
NULL, /* flow_dump_destroy */ \
NULL, /* flow_dump_next */ \
netdev_dpdk_flow_put, \
NULL, /* flow_get */ \
netdev_dpdk_flow_del, \
NULL /* init_flow_api */
/*
 * Expands to a brace-enclosed, positional initializer used below to define
 * the 'struct netdev_class' instances of this file.
 *
 * The macro arguments are the per-class entries:
 *   NAME              class type name string
 *   INIT              init callback
 *   CONSTRUCT         construct callback
 *   DESTRUCT          destruct callback
 *   SET_CONFIG        set_config callback
 *   SET_TX_MULTIQ     entry placed right after get_numa_id
 *   SEND              send callback
 *   GET_CARRIER       entry placed right after get_ifindex
 *   GET_STATS         entry placed right after set_miimon
 *   GET_CUSTOM_STATS  entry placed right after GET_STATS
 *   GET_FEATURES      entry placed right after GET_CUSTOM_STATS
 *   GET_STATUS        entry placed right after get_next_hop
 *   RECONFIGURE       entry placed right after update_flags
 *   RXQ_RECV          entry placed right after rxq_dealloc
 *
 * All remaining entries are shared by every class: either a common
 * netdev_dpdk_* function or NULL.  is_pmd is always true.  The flow-offload
 * entries come from DPDK_FLOW_OFFLOAD_API.
 *
 * Because the initializer is positional, entries must not be reordered,
 * added or removed here on their own.
 * NOTE(review): the order presumably mirrors struct netdev_class in
 * netdev-provider.h -- confirm before changing anything.
 */
#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \
SET_CONFIG, SET_TX_MULTIQ, SEND, \
GET_CARRIER, GET_STATS, \
GET_CUSTOM_STATS, \
GET_FEATURES, GET_STATUS, \
RECONFIGURE, RXQ_RECV) \
{ \
NAME, \
true, /* is_pmd */ \
INIT, /* init */ \
NULL, /* netdev_dpdk_run */ \
NULL, /* netdev_dpdk_wait */ \
\
netdev_dpdk_alloc, \
CONSTRUCT, \
DESTRUCT, \
netdev_dpdk_dealloc, \
netdev_dpdk_get_config, \
SET_CONFIG, \
NULL, /* get_tunnel_config */ \
NULL, /* build header */ \
NULL, /* push header */ \
NULL, /* pop header */ \
netdev_dpdk_get_numa_id, /* get_numa_id */ \
SET_TX_MULTIQ, \
\
SEND, /* send */ \
NULL, /* send_wait */ \
\
netdev_dpdk_set_etheraddr, \
netdev_dpdk_get_etheraddr, \
netdev_dpdk_get_mtu, \
netdev_dpdk_set_mtu, \
netdev_dpdk_get_ifindex, \
GET_CARRIER, \
netdev_dpdk_get_carrier_resets, \
netdev_dpdk_set_miimon, \
GET_STATS, \
GET_CUSTOM_STATS, \
GET_FEATURES, \
NULL, /* set_advertisements */ \
NULL, /* get_pt_mode */ \
\
netdev_dpdk_set_policing, \
netdev_dpdk_get_qos_types, \
NULL, /* get_qos_capabilities */ \
netdev_dpdk_get_qos, \
netdev_dpdk_set_qos, \
NULL, /* get_queue */ \
NULL, /* set_queue */ \
NULL, /* delete_queue */ \
NULL, /* get_queue_stats */ \
NULL, /* queue_dump_start */ \
NULL, /* queue_dump_next */ \
NULL, /* queue_dump_done */ \
NULL, /* dump_queue_stats */ \
\
NULL, /* set_in4 */ \
NULL, /* get_addr_list */ \
NULL, /* add_router */ \
NULL, /* get_next_hop */ \
GET_STATUS, \
NULL, /* arp_lookup */ \
\
netdev_dpdk_update_flags, \
RECONFIGURE, \
\
netdev_dpdk_rxq_alloc, \
netdev_dpdk_rxq_construct, \
netdev_dpdk_rxq_destruct, \
netdev_dpdk_rxq_dealloc, \
RXQ_RECV, \
NULL, /* rx_wait */ \
NULL, /* rxq_drain */ \
DPDK_FLOW_OFFLOAD_API, \
NULL /* get_block_id */ \
}
/*
 * netdev class with type name "dpdk".
 *
 * Arguments follow the NETDEV_DPDK_CLASS parameter order: NAME, INIT,
 * CONSTRUCT, DESTRUCT, SET_CONFIG, SET_TX_MULTIQ, SEND, GET_CARRIER,
 * GET_STATS, GET_CUSTOM_STATS, GET_FEATURES, GET_STATUS, RECONFIGURE,
 * RXQ_RECV.  Every slot is filled in for this class.
 */
static const struct netdev_class dpdk_class =
NETDEV_DPDK_CLASS(
"dpdk",
netdev_dpdk_class_init,
netdev_dpdk_construct,
netdev_dpdk_destruct,
netdev_dpdk_set_config,
netdev_dpdk_set_tx_multiq,
netdev_dpdk_eth_send,
netdev_dpdk_get_carrier,
netdev_dpdk_get_stats,
netdev_dpdk_get_custom_stats,
netdev_dpdk_get_features,
netdev_dpdk_get_status,
netdev_dpdk_reconfigure,
netdev_dpdk_rxq_recv);
/*
 * netdev class with type name "dpdkr".
 *
 * Arguments follow the NETDEV_DPDK_CLASS parameter order: NAME, INIT,
 * CONSTRUCT, DESTRUCT, SET_CONFIG, SET_TX_MULTIQ, SEND, GET_CARRIER,
 * GET_STATS, GET_CUSTOM_STATS, GET_FEATURES, GET_STATUS, RECONFIGURE,
 * RXQ_RECV.  It differs from dpdk_class only in its ring-specific CONSTRUCT,
 * SET_CONFIG and SEND callbacks.
 */
static const struct netdev_class dpdk_ring_class =
NETDEV_DPDK_CLASS(
"dpdkr",
netdev_dpdk_class_init,
netdev_dpdk_ring_construct,
netdev_dpdk_destruct,
netdev_dpdk_ring_set_config,
netdev_dpdk_set_tx_multiq,
netdev_dpdk_ring_send,
netdev_dpdk_get_carrier,
netdev_dpdk_get_stats,
netdev_dpdk_get_custom_stats,
netdev_dpdk_get_features,
netdev_dpdk_get_status,
netdev_dpdk_reconfigure,
netdev_dpdk_rxq_recv);
/*
 * netdev class with type name "dpdkvhostuser".
 *
 * Arguments follow the NETDEV_DPDK_CLASS parameter order: NAME, INIT,
 * CONSTRUCT, DESTRUCT, SET_CONFIG, SET_TX_MULTIQ, SEND, GET_CARRIER,
 * GET_STATS, GET_CUSTOM_STATS, GET_FEATURES, GET_STATUS, RECONFIGURE,
 * RXQ_RECV.  INIT, SET_CONFIG, SET_TX_MULTIQ, GET_CUSTOM_STATS and
 * GET_FEATURES are NULL for this class.
 */
static const struct netdev_class dpdk_vhost_class =
NETDEV_DPDK_CLASS(
"dpdkvhostuser",
NULL,
netdev_dpdk_vhost_construct,
netdev_dpdk_vhost_destruct,
NULL,
NULL,
netdev_dpdk_vhost_send,
netdev_dpdk_vhost_get_carrier,
netdev_dpdk_vhost_get_stats,
NULL,
NULL,
netdev_dpdk_vhost_user_get_status,
netdev_dpdk_vhost_reconfigure,
netdev_dpdk_vhost_rxq_recv);
/*
 * netdev class with type name "dpdkvhostuserclient".
 *
 * Arguments follow the NETDEV_DPDK_CLASS parameter order: NAME, INIT,
 * CONSTRUCT, DESTRUCT, SET_CONFIG, SET_TX_MULTIQ, SEND, GET_CARRIER,
 * GET_STATS, GET_CUSTOM_STATS, GET_FEATURES, GET_STATUS, RECONFIGURE,
 * RXQ_RECV.  INIT, SET_TX_MULTIQ, GET_CUSTOM_STATS and GET_FEATURES are NULL.
 * Compared with dpdk_vhost_class it uses client-specific CONSTRUCT,
 * SET_CONFIG and RECONFIGURE callbacks.
 */
static const struct netdev_class dpdk_vhost_client_class =
NETDEV_DPDK_CLASS(
"dpdkvhostuserclient",
NULL,
netdev_dpdk_vhost_client_construct,
netdev_dpdk_vhost_destruct,
netdev_dpdk_vhost_client_set_config,
NULL,
netdev_dpdk_vhost_send,
netdev_dpdk_vhost_get_carrier,
netdev_dpdk_vhost_get_stats,
NULL,
NULL,
netdev_dpdk_vhost_user_get_status,
netdev_dpdk_vhost_client_reconfigure,
netdev_dpdk_vhost_rxq_recv);
/*
 * Registers the four netdev classes defined in this file, in the order
 * "dpdk", "dpdkr", "dpdkvhostuser", "dpdkvhostuserclient".  The return value
 * of each registration is ignored.
 */
void
netdev_dpdk_register(void)
{
    static const struct netdev_class *const providers[] = {
        &dpdk_class,
        &dpdk_ring_class,
        &dpdk_vhost_class,
        &dpdk_vhost_client_class,
    };
    size_t i;

    for (i = 0; i < sizeof providers / sizeof providers[0]; i++) {
        netdev_register_provider(providers[i]);
    }
}