ovs/lib/netdev-dpdk.c
Latest commit: dd443c1a7a "netdev-dpdk: Stop relying on vhost-user Tx flags."
vhost-user legacy behavior has been to mark mbufs with Tx offload flags
based on what the virtio-net header contained (but provide no
Rx information, like IP checksum or L4 checksum validity).

Changing to the non-legacy mode means that no code outside of OVS should
set any RTE_MBUF_F_TX_* flag. Add a check accordingly.

Link: https://git.dpdk.org/dpdk/commit/?id=ca7036b4af3a
Reported-at: https://issues.redhat.com/browse/FDP-1147
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-19 21:03:15 +02:00

/*
* Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "netdev-dpdk.h"
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/virtio_net.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <linux/if.h>
#include <rte_bus.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_dev.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_meter.h>
#include <rte_pci.h>
#include <rte_version.h>
#include <rte_vhost.h>
#include "cmap.h"
#include "coverage.h"
#include "dirs.h"
#include "dp-packet.h"
#include "dpdk.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "if-notifier.h"
#include "mpsc-queue.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-parse.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/poll-loop.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "ovs-thread.h"
#include "packets.h"
#include "smap.h"
#include "sset.h"
#include "timeval.h"
#include "unaligned.h"
#include "unixctl.h"
#include "userspace-tso.h"
#include "util.h"
#include "uuid.h"
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
COVERAGE_DEFINE(vhost_tx_contention);
static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */
static bool vhost_postcopy_enabled = false; /* Status of vHost POSTCOPY
* support. */
static bool per_port_memory = false; /* Status of per port memory support */
#define DPDK_PORT_WATCHDOG_INTERVAL 5
#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"
/*
* need to reserve tons of extra space in the mbufs so we can align the
* DMA addresses to 4KB.
* The minimum mbuf size is limited to avoid scatter behaviour and a drop in
* performance for standard Ethernet MTU.
*/
#define ETHER_HDR_MAX_LEN (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN \
+ (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu) ((mtu) + RTE_ETHER_HDR_LEN + \
RTE_ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
- RTE_ETHER_HDR_LEN - RTE_ETHER_CRC_LEN)
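/* A worked example (illustrative only) for the standard Ethernet MTU of
* 1500 bytes, using RTE_ETHER_HDR_LEN = 14, RTE_ETHER_CRC_LEN = 4 and
* VLAN_HEADER_LEN = 4:
*
*   MTU_TO_FRAME_LEN(1500)     = 1500 + 14 + 4     = 1518
*   MTU_TO_MAX_FRAME_LEN(1500) = 1500 + 14 + 4 + 8 = 1526 (two VLAN tags)
*   FRAME_LEN_TO_MTU(1518)     = 1518 - 14 - 4     = 1500
*/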
#define NETDEV_DPDK_MBUF_ALIGN 1024
#define NETDEV_DPDK_MAX_PKT_LEN 9728
/* Max and min number of packets in the mempool. OVS tries to allocate a
* mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
* enough hugepages) we keep halving the number until the allocation succeeds
* or we reach MIN_NB_MBUF */
#define MAX_NB_MBUF (4096 * 64)
#define MIN_NB_MBUF (4096 * 4)
#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
== 0);
/* The smallest possible NB_MBUF that we're going to try should be a multiple
* of MP_CACHE_SZ. This is advised by DPDK documentation. */
BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
% MP_CACHE_SZ == 0);
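/* Illustrative only: with the values above, MAX_NB_MBUF = 4096 * 64 = 262144
* and MIN_NB_MBUF = 4096 * 4 = 16384, so the halving sequence tried on
* allocation failure is 262144 -> 131072 -> 65536 -> 32768 -> 16384. The
* asserts above guarantee that the smallest value reached is still a
* multiple of MP_CACHE_SZ (typically 512). */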
#define SOCKET0 0
/* Default size of Physical NIC RXQ */
#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
/* Default size of Physical NIC TXQ */
#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
/* Maximum size of Physical NIC Queues */
#define NIC_PORT_MAX_Q_SIZE 4096
#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
* yet mapped to another queue. */
#define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
#define DPDK_PORT_ID_FMT "%"PRIu16
/* Minimum number of vhost tx retries; zero effectively disables retries. */
#define VHOST_ENQ_RETRY_MIN 0
/* Maximum number of vhost tx retries. */
#define VHOST_ENQ_RETRY_MAX 32
/* Legacy default value for vhost tx retries. */
#define VHOST_ENQ_RETRY_DEF 8
/* VDUSE-only; ignored for vhost-user. */
#define VHOST_MAX_QUEUE_PAIRS_MIN 1
#define VHOST_MAX_QUEUE_PAIRS_DEF VHOST_MAX_QUEUE_PAIRS_MIN
#define VHOST_MAX_QUEUE_PAIRS_MAX 128
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
/* List of required flags advertised by the hardware that will be used
* if TSO is enabled. Ideally this should include
* RTE_ETH_TX_OFFLOAD_SCTP_CKSUM. However, very few drivers support it
* at the moment, and SCTP is not as widely used a protocol as TCP and
* UDP, so it's optional. */
#define DPDK_TX_TSO_OFFLOAD_FLAGS (RTE_ETH_TX_OFFLOAD_TCP_TSO \
| RTE_ETH_TX_OFFLOAD_TCP_CKSUM \
| RTE_ETH_TX_OFFLOAD_UDP_CKSUM \
| RTE_ETH_TX_OFFLOAD_IPV4_CKSUM)
static const struct rte_eth_conf port_conf = {
.rxmode = {
.offloads = 0,
},
.rx_adv_conf = {
.rss_conf = {
.rss_key = NULL,
.rss_hf = RTE_ETH_RSS_IP | RTE_ETH_RSS_UDP | RTE_ETH_RSS_TCP,
},
},
.txmode = {
.mq_mode = RTE_ETH_MQ_TX_NONE,
},
};
/*
* These callbacks allow virtio-net devices to be added to vhost ports when
* configuration has been fully completed.
*/
static int new_device(int vid);
static void destroy_device(int vid);
static int vring_state_changed(int vid, uint16_t queue_id, int enable);
static void destroy_connection(int vid);
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
.new_device = new_device,
.destroy_device = destroy_device,
.vring_state_changed = vring_state_changed,
.features_changed = NULL,
.new_connection = NULL,
.destroy_connection = destroy_connection,
};
/* Custom software stats for dpdk ports */
struct netdev_dpdk_sw_stats {
/* No. of retries when unable to transmit. */
uint64_t tx_retries;
/* Packet drops when unable to transmit; probably the Tx queue is full. */
uint64_t tx_failure_drops;
/* Packet length greater than device MTU. */
uint64_t tx_mtu_exceeded_drops;
/* Packet drops in egress policer processing. */
uint64_t tx_qos_drops;
/* Packet drops in ingress policer processing. */
uint64_t rx_qos_drops;
/* Packet drops in HWOL processing. */
uint64_t tx_invalid_hwol_drops;
};
enum dpdk_dev_type {
DPDK_DEV_ETH = 0,
DPDK_DEV_VHOST = 1,
};
/* Quality of Service */
/* An instance of a QoS configuration. Always associated with a particular
* network device.
*
* Each QoS implementation subclasses this with whatever additional data it
* needs.
*/
struct qos_conf {
const struct dpdk_qos_ops *ops;
rte_spinlock_t lock;
};
/* QoS queue information used by the netdev queue dump functions. */
struct netdev_dpdk_queue_state {
uint32_t *queues;
size_t cur_queue;
size_t n_queues;
};
/* A particular implementation of dpdk QoS operations.
*
* The functions below return 0 if successful or a positive errno value on
* failure, except where otherwise noted. All of them must be provided, except
* where otherwise noted.
*/
struct dpdk_qos_ops {
/* Name of the QoS type */
const char *qos_name;
/* Called to construct a qos_conf object. The implementation should make
* the appropriate calls to configure QoS according to 'details'.
*
* The contents of 'details' should be documented as valid for 'ovs_name'
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
* (which is built as ovs-vswitchd.conf.db(8)).
*
* This function must return 0 if and only if it sets '*conf' to an
* initialized 'struct qos_conf'.
*
* For all QoS implementations it should always be non-null.
*/
int (*qos_construct)(const struct smap *details, struct qos_conf **conf);
/* Destroys the data structures allocated by the implementation as part of
* 'qos_conf'.
*
* For all QoS implementations it should always be non-null.
*/
void (*qos_destruct)(struct qos_conf *conf);
/* Retrieves details of 'conf' configuration into 'details'.
*
* The contents of 'details' should be documented as valid for 'ovs_name'
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
* (which is built as ovs-vswitchd.conf.db(8)).
*/
int (*qos_get)(const struct qos_conf *conf, struct smap *details);
/* Returns true if 'conf' is already configured according to 'details'.
*
* The contents of 'details' should be documented as valid for 'ovs_name'
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
* (which is built as ovs-vswitchd.conf.db(8)).
*
* For all QoS implementations it should always be non-null.
*/
bool (*qos_is_equal)(const struct qos_conf *conf,
const struct smap *details);
/* Modify an array of rte_mbufs. The modification is specific to
* each qos implementation.
*
* The function should take an array of mbufs and an int representing
* the current number of mbufs present in the array.
*
* After the function has performed a qos modification to the array of
* mbufs, it returns an int representing the number of mbufs now present in
* the array. This value can then be passed to the port send function
* along with the modified array for transmission.
*
* For all QoS implementations it should always be non-null.
*/
int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
int pkt_cnt, bool should_steal);
/* Called to construct a QoS Queue. The implementation should make
* the appropriate calls to configure QoS Queue according to 'details'.
*
* The contents of 'details' should be documented as valid for 'ovs_name'
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
* (which is built as ovs-vswitchd.conf.db(8)).
*
* This function must return 0 if and only if it constructs
* QoS queue successfully.
*/
int (*qos_queue_construct)(const struct smap *details,
uint32_t queue_id, struct qos_conf *conf);
/* Destroys the QoS Queue. */
void (*qos_queue_destruct)(struct qos_conf *conf, uint32_t queue_id);
/* Retrieves details of QoS Queue configuration into 'details'.
*
* The contents of 'details' should be documented as valid for 'ovs_name'
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
* (which is built as ovs-vswitchd.conf.db(8)).
*/
int (*qos_queue_get)(struct smap *details, uint32_t queue_id,
const struct qos_conf *conf);
/* Retrieves statistics of QoS Queue configuration into 'stats'. */
int (*qos_queue_get_stats)(const struct qos_conf *conf, uint32_t queue_id,
struct netdev_queue_stats *stats);
/* Sets up the 'netdev_dpdk_queue_state' structure used by the dpdk queue
* dump functions.
*/
int (*qos_queue_dump_state_init)(const struct qos_conf *conf,
struct netdev_dpdk_queue_state *state);
};
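/* Illustrative sketch only (hypothetical, not part of OVS): a minimal
* pass-through 'qos_run' callback showing the contract described above.
* It takes 'pkt_cnt' mbufs and returns how many of them remain valid for
* transmission; a real implementation would filter out non-conforming
* packets, freeing them only when 'should_steal' is true. */
static inline int
example_noop_qos_run(struct qos_conf *conf OVS_UNUSED,
                     struct rte_mbuf **pkts OVS_UNUSED,
                     int pkt_cnt, bool should_steal OVS_UNUSED)
{
    /* Accept every packet; the array is left untouched. */
    return pkt_cnt;
}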
/* dpdk_qos_ops for each type of user space QoS implementation. */
static const struct dpdk_qos_ops egress_policer_ops;
static const struct dpdk_qos_ops trtcm_policer_ops;
/*
* Array of dpdk_qos_ops, contains pointer to all supported QoS
* operations.
*/
static const struct dpdk_qos_ops *const qos_confs[] = {
&egress_policer_ops,
&trtcm_policer_ops,
NULL
};
static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
/* Contains all 'struct dpdk_dev's. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
= OVS_LIST_INITIALIZER(&dpdk_list);
static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
= OVS_MUTEX_INITIALIZER;
/* Contains all 'struct dpdk_mp's. */
static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
= OVS_LIST_INITIALIZER(&dpdk_mp_list);
struct dpdk_mp {
struct rte_mempool *mp;
int mtu;
int socket_id;
int refcount;
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
};
struct user_mempool_config {
int adj_mtu;
int socket_id;
};
static struct user_mempool_config *user_mempools = NULL;
static int n_user_mempools;
/* There should be one 'struct dpdk_tx_queue' created for
* each netdev tx queue. */
struct dpdk_tx_queue {
/* Padding to make dpdk_tx_queue exactly one cache line long. */
PADDED_MEMBERS(CACHE_LINE_SIZE,
/* Protects the members and the NIC queue from concurrent access.
* It is used only if the queue is shared among different pmd threads
* (see 'concurrent_txq'). */
rte_spinlock_t tx_lock;
/* Mapping of a configured vhost-user queue to the queue enabled by
* the guest. */
int map;
);
};
struct ingress_policer {
struct rte_meter_srtcm_params app_srtcm_params;
struct rte_meter_srtcm in_policer;
struct rte_meter_srtcm_profile in_prof;
rte_spinlock_t policer_lock;
};
enum dpdk_hw_ol_features {
NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
NETDEV_RX_HW_CRC_STRIP = 1 << 1,
NETDEV_RX_HW_SCATTER = 1 << 2,
NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3,
NETDEV_TX_TCP_CKSUM_OFFLOAD = 1 << 4,
NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
NETDEV_TX_TSO_OFFLOAD = 1 << 7,
NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
NETDEV_TX_GRE_TNL_TSO_OFFLOAD = 1 << 12,
};
enum dpdk_rx_steer_flags {
DPDK_RX_STEER_LACP = 1 << 0,
};
/* Flags for the netdev_dpdk virtio_features_state field.
* This is used for the virtio features recovery mechanism linked to TSO
* support. */
#define OVS_VIRTIO_F_CLEAN (UINT8_C(1) << 0)
#define OVS_VIRTIO_F_WORKAROUND (UINT8_C(1) << 1)
#define OVS_VIRTIO_F_NEGOTIATED (UINT8_C(1) << 2)
#define OVS_VIRTIO_F_RECONF_PENDING (UINT8_C(1) << 3)
#define OVS_VIRTIO_F_CLEAN_NEGOTIATED \
(OVS_VIRTIO_F_CLEAN | OVS_VIRTIO_F_NEGOTIATED)
#define OVS_VIRTIO_F_WORKAROUND_NEGOTIATED \
(OVS_VIRTIO_F_WORKAROUND | OVS_VIRTIO_F_NEGOTIATED)
/*
* In order to avoid confusion in variable names, the following naming
* convention should be used, if possible:
*
* 'struct netdev' : 'netdev'
* 'struct netdev_dpdk' : 'dev'
* 'struct netdev_rxq' : 'rxq'
* 'struct netdev_rxq_dpdk' : 'rx'
*
* Example:
* struct netdev *netdev = netdev_from_name(name);
* struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
*
* Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
* already defined.
*/
struct netdev_dpdk {
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
dpdk_port_t port_id;
/* If true, device was attached by rte_eth_dev_attach(). */
bool attached;
/* If true, rte_eth_dev_start() was successfully called. */
bool started;
/* If true, this is a port representor. */
bool is_representor;
struct eth_addr hwaddr;
/* 1 pad byte here. */
int mtu;
int socket_id;
int buf_size;
int max_packet_len;
enum dpdk_dev_type type;
enum netdev_flags flags;
int link_reset_cnt;
union {
/* Device arguments for dpdk ports. */
char *devargs;
/* Identifier used to distinguish vhost devices from each other. */
char *vhost_id;
};
struct dpdk_tx_queue *tx_q;
struct rte_eth_link link;
);
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
struct dpdk_mp *dpdk_mp;
/* virtio identifier for vhost devices */
ovsrcu_index vid;
/* True if vHost device is 'up' and has been reconfigured at least once */
bool vhost_reconfigured;
atomic_uint8_t vhost_tx_retries_max;
/* Flags for virtio features recovery mechanism. */
uint8_t virtio_features_state;
/* 1 pad byte here. */
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
struct netdev up;
/* In dpdk_list. */
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
/* QoS configuration and lock for the device */
OVSRCU_TYPE(struct qos_conf *) qos_conf;
/* Ingress Policer */
OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
uint32_t policer_rate;
uint32_t policer_burst;
/* Array of vhost rxq states, see vring_state_changed. */
bool *vhost_rxq_enabled;
/* Ensures that Rx metadata delivery is configured only once. */
bool rx_metadata_delivery_configured;
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
struct netdev_stats stats;
struct netdev_dpdk_sw_stats *sw_stats;
/* Protects stats */
rte_spinlock_t stats_lock;
/* 36 pad bytes here. */
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
/* The following properties cannot be changed when a device is running,
* so we remember the request and update them next time
* netdev_dpdk*_reconfigure() is called */
int requested_mtu;
int requested_n_txq;
/* User input for n_rxq (see dpdk_set_rxq_config). */
int user_n_rxq;
/* user_n_rxq + an optional rx steering queue (see
* netdev_dpdk_reconfigure). This field is different from the other
* requested_* fields as it may contain a different value than the user
* input. */
int requested_n_rxq;
int requested_rxq_size;
int requested_txq_size;
/* Number of rx/tx descriptors for physical devices */
int rxq_size;
int txq_size;
/* Socket ID detected when vHost device is brought up */
int requested_socket_id;
/* Ignored by DPDK for vhost-user backends; used only for VDUSE. */
uint8_t vhost_max_queue_pairs;
/* Denotes whether vHost port is client/server mode */
uint64_t vhost_driver_flags;
/* DPDK-ETH Flow control */
struct rte_eth_fc_conf fc_conf;
/* DPDK-ETH hardware offload features,
* from the enum set 'dpdk_hw_ol_features' */
uint32_t hw_ol_features;
/* Properties for link state change detection mode.
* If lsc_interrupt_mode is set to false, poll mode is used,
* otherwise interrupt mode is used. */
bool requested_lsc_interrupt_mode;
bool lsc_interrupt_mode;
/* VF configuration. */
struct eth_addr requested_hwaddr;
/* Requested rx queue steering flags,
* from the enum set 'dpdk_rx_steer_flags'. */
uint64_t requested_rx_steer_flags;
uint64_t rx_steer_flags;
size_t rx_steer_flows_num;
struct rte_flow **rx_steer_flows;
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
/* Names of all XSTATS counters */
struct rte_eth_xstat_name *rte_xstats_names;
int rte_xstats_names_size;
int rte_xstats_ids_size;
uint64_t *rte_xstats_ids;
);
};
struct netdev_rxq_dpdk {
struct netdev_rxq up;
dpdk_port_t port_id;
};
static void netdev_dpdk_destruct(struct netdev *netdev);
static void netdev_dpdk_vhost_destruct(struct netdev *netdev);
static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
struct netdev_custom_stats *);
static void netdev_dpdk_configure_xstats(struct netdev_dpdk *dev);
static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
static void netdev_dpdk_mbuf_dump(const char *prefix, const char *message,
const struct rte_mbuf *);
static bool
is_dpdk_class(const struct netdev_class *class)
{
return class->destruct == netdev_dpdk_destruct
|| class->destruct == netdev_dpdk_vhost_destruct;
}
/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
* aligned at 1k or less. If a declared mbuf size is not a multiple of this
* value, insufficient buffers are allocated to accommodate the packet in its
* entirety. Furthermore, certain drivers need to ensure that there is also
* sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
* frames). If the RX buffer is too small, then the driver enables scatter RX
* behaviour, which reduces performance. To prevent this, use a buffer size
* that is closest to 'mtu', but which satisfies the aforementioned criteria.
*/
static uint32_t
dpdk_buf_size(int mtu)
{
return ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu), NETDEV_DPDK_MBUF_ALIGN)
+ RTE_PKTMBUF_HEADROOM;
}
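/* Worked example (illustrative only, assuming the common default
* RTE_PKTMBUF_HEADROOM of 128 bytes):
*
*   dpdk_buf_size(1500) = ROUND_UP(1526, 1024) + 128 = 2048 + 128 = 2176
*
* so a standard-MTU port ends up with uniformly sized 2176-byte buffers
* rather than oddly sized ones that could trigger scatter Rx in some
* drivers. */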
static int
dpdk_get_user_adjusted_mtu(int port_adj_mtu, int port_mtu, int port_socket_id)
{
int best_adj_user_mtu = INT_MAX;
for (unsigned i = 0; i < n_user_mempools; i++) {
int user_adj_mtu, user_socket_id;
user_adj_mtu = user_mempools[i].adj_mtu;
user_socket_id = user_mempools[i].socket_id;
if (port_adj_mtu > user_adj_mtu
|| (user_socket_id != INT_MAX
&& user_socket_id != port_socket_id)) {
continue;
}
if (user_adj_mtu < best_adj_user_mtu) {
/* This is the lowest valid user MTU. */
best_adj_user_mtu = user_adj_mtu;
if (best_adj_user_mtu == port_adj_mtu) {
/* Found an exact fit, no need to keep searching. */
break;
}
}
}
if (best_adj_user_mtu == INT_MAX) {
VLOG_DBG("No user configured shared mempool mbuf sizes found "
"suitable for port with MTU %d, NUMA %d.", port_mtu,
port_socket_id);
best_adj_user_mtu = port_adj_mtu;
} else {
VLOG_DBG("Found user configured shared mempool with mbufs "
"of size %d, suitable for port with MTU %d, NUMA %d.",
MTU_TO_FRAME_LEN(best_adj_user_mtu), port_mtu,
port_socket_id);
}
return best_adj_user_mtu;
}
/* Allocates an area of 'sz' bytes from DPDK. The memory is zero'ed.
*
* Unlike xmalloc(), this function can return NULL on failure. */
static void *
dpdk_rte_mzalloc(size_t sz)
{
return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
}
void
free_dpdk_buf(struct dp_packet *p)
{
struct rte_mbuf *pkt = (struct rte_mbuf *) p;
rte_pktmbuf_free(pkt);
}
static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
void *opaque_arg OVS_UNUSED,
void *_p,
unsigned i OVS_UNUSED)
{
struct rte_mbuf *pkt = _p;
dp_packet_init_dpdk((struct dp_packet *) pkt);
}
static int
dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
/* At this point we want to know if all the mbufs are back
* in the mempool. rte_mempool_full() is not atomic but it's
* the best available and as we are no longer requesting mbufs
* from the mempool, it means mbufs will not move from
* 'mempool ring' --> 'mempool cache'. In rte_mempool_full()
* the ring is counted before caches, so we won't get false
* positives in this use case and we handle false negatives.
*
* If future implementations of rte_mempool_full() were to change,
* a false positive could become possible. Even that would
* likely be ok, as there are additional checks during mempool
* freeing, but it would make things racy.
*/
return rte_mempool_full(mp);
}
/* Free unused mempools. */
static void
dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
{
struct dpdk_mp *dmp;
LIST_FOR_EACH_SAFE (dmp, list_node, &dpdk_mp_list) {
if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
ovs_list_remove(&dmp->list_node);
rte_mempool_free(dmp->mp);
rte_free(dmp);
}
}
}
/* Calculating the required number of mbufs differs depending on the
* mempool model being used. Check if per port memory is in use before
* calculating.
*/
static uint32_t
dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu)
{
uint32_t n_mbufs;
if (!per_port_memory) {
/* Shared memory is being used.
* XXX: this is a really rough method of provisioning memory.
* It's impossible to determine what the exact memory requirements are
* when the number of ports and rxqs that utilize a particular mempool
* can change dynamically at runtime. For now, use this rough
* heuristic.
*/
if (mtu >= RTE_ETHER_MTU) {
n_mbufs = MAX_NB_MBUF;
} else {
n_mbufs = MIN_NB_MBUF;
}
} else {
/* Per port memory is being used.
* XXX: rough estimation of number of mbufs required for this port:
* <packets required to fill the device rxqs>
* + <packets that could be stuck on other ports txqs>
* + <packets in the pmd threads>
* + <additional memory for corner cases>
*/
n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
+ dev->requested_n_txq * dev->requested_txq_size
+ MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
+ MIN_NB_MBUF;
}
return n_mbufs;
}
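/* Illustrative only: with the default configuration of one rxq and one txq
* of 2048 descriptors each, and with NETDEV_MAX_BURST of 32, the per-port
* estimate above works out to:
*
*   2048 + 2048 + MIN(RTE_MAX_LCORE, 1) * 32 + 16384 = 20512 mbufs
*/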
static struct dpdk_mp *
dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
{
char mp_name[RTE_MEMPOOL_NAMESIZE];
const char *netdev_name = netdev_get_name(&dev->up);
int socket_id = dev->requested_socket_id;
uint32_t n_mbufs = 0;
uint32_t mbuf_size = 0;
uint32_t aligned_mbuf_size = 0;
uint32_t mbuf_priv_data_len = 0;
uint32_t pkt_size = 0;
uint32_t hash = hash_string(netdev_name, 0);
struct dpdk_mp *dmp = NULL;
int ret;
dmp = dpdk_rte_mzalloc(sizeof *dmp);
if (!dmp) {
return NULL;
}
dmp->socket_id = socket_id;
dmp->mtu = mtu;
dmp->refcount = 1;
/* Get the size of each mbuf, based on the MTU */
mbuf_size = MTU_TO_FRAME_LEN(mtu);
n_mbufs = dpdk_calculate_mbufs(dev, mtu);
do {
/* Full DPDK memory pool name must be unique and cannot be
* longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
* mempool case this can result in one device using a mempool
* which references a different device in its name. However, as
* mempool names are hashed, the device name will not be readable
* so this is not an issue for tasks such as debugging.
*/
ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
"ovs%08x%02d%05d%07u",
hash, socket_id, mtu, n_mbufs);
if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
VLOG_DBG("snprintf returned %d. "
"Failed to generate a mempool name for \"%s\". "
"Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
ret, netdev_name, hash, socket_id, mtu, n_mbufs);
break;
}
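/* Illustrative only: with a hypothetical hash of 0x1a2b3c4d, socket 0,
* MTU 1500 and 262144 mbufs, the generated name would be
* "ovs1a2b3c4d00015000262144" (25 characters, which fits within
* RTE_MEMPOOL_NAMESIZE). */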
VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
"on socket %d for %d Rx and %d Tx queues, "
"cache line size of %u",
netdev_name, n_mbufs, mbuf_size, socket_id,
dev->requested_n_rxq, dev->requested_n_txq,
RTE_CACHE_LINE_SIZE);
/* The size of the mbuf's private area (i.e. the area that holds OVS'
* dp_packet data). */
mbuf_priv_data_len = sizeof(struct dp_packet) -
sizeof(struct rte_mbuf);
/* The size of the entire dp_packet. */
pkt_size = sizeof(struct dp_packet) + mbuf_size;
/* mbuf size, rounded up to cacheline size. */
aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE);
/* If there is a size discrepancy, add padding to mbuf_priv_data_len.
* This maintains mbuf size cache alignment, while also honoring RX
* buffer alignment in the data portion of the mbuf. If this adjustment
* is not made, there is a possibility later on that for an element of
* the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
* This is problematic in the case of multi-segment mbufs, particularly
* when an mbuf segment needs to be resized (when [push|popp]ing a VLAN
* header, for example).
*/
mbuf_priv_data_len += (aligned_mbuf_size - pkt_size);
dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
mbuf_priv_data_len,
mbuf_size,
socket_id);
if (dmp->mp) {
VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
mp_name, n_mbufs);
/* rte_pktmbuf_pool_create has done some initialization of the
* rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
* initializes some OVS specific fields of dp_packet.
*/
rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
return dmp;
} else if (rte_errno == EEXIST) {
/* A mempool with the same name already exists. We just
* retrieve its pointer to be returned to the caller. */
dmp->mp = rte_mempool_lookup(mp_name);
/* As the mempool create returned EEXIST we can expect the
* lookup has returned a valid pointer. If for some reason
* that's not the case, we keep track of it. */
VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
mp_name, dmp->mp);
return dmp;
} else {
VLOG_DBG("Failed to create mempool \"%s\" with a request of "
"%u mbufs, retrying with %u mbufs",
mp_name, n_mbufs, n_mbufs / 2);
}
} while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);
VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
mp_name, n_mbufs);
rte_free(dmp);
return NULL;
}
static struct dpdk_mp *
dpdk_mp_get(struct netdev_dpdk *dev, int mtu)
{
struct dpdk_mp *dmp = NULL, *next;
bool reuse = false;
ovs_mutex_lock(&dpdk_mp_mutex);
/* Check if shared memory is being used, if so check existing mempools
* to see if reuse is possible. */
if (!per_port_memory) {
/* If the user has provided defined mempools, check if one is suitable
* and get the new buffer size. */
mtu = dpdk_get_user_adjusted_mtu(mtu, dev->requested_mtu,
dev->requested_socket_id);
LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
if (dmp->socket_id == dev->requested_socket_id
&& dmp->mtu == mtu) {
VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
dmp->refcount++;
reuse = true;
break;
}
}
}
/* Sweep mempools after reuse or before create. */
dpdk_mp_sweep();
if (!reuse) {
dmp = dpdk_mp_create(dev, mtu);
if (dmp) {
/* Shared memory will hit the reuse case above, so it will not
* request a mempool that already exists, but we need to check
* for the EEXIST case when per-port memory is used. Compare the
* mempool returned by dmp to each entry in dpdk_mp_list. If a
* match is found, free dmp as a new entry is not required, set
* dmp to point to the existing entry and increment the refcount
* to avoid being freed at a later stage.
*/
if (per_port_memory && rte_errno == EEXIST) {
LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
if (dmp->mp == next->mp) {
rte_free(dmp);
dmp = next;
dmp->refcount++;
}
}
} else {
ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
}
}
}
ovs_mutex_unlock(&dpdk_mp_mutex);
return dmp;
}
/* Decrement reference to a mempool. */
static void
dpdk_mp_put(struct dpdk_mp *dmp)
{
if (!dmp) {
return;
}
ovs_mutex_lock(&dpdk_mp_mutex);
ovs_assert(dmp->refcount);
dmp->refcount--;
ovs_mutex_unlock(&dpdk_mp_mutex);
}
/* Depending on the memory model being used this function tries to
* identify and reuse an existing mempool or tries to allocate a new
* mempool on requested_socket_id with mbuf size corresponding to the
* requested_mtu. On success, a new configuration will be applied.
* On error, device will be left unchanged. */
static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
struct dpdk_mp *dmp;
int ret = 0;
/* With shared memory we do not need to configure a mempool if the MTU
* and socket ID have not changed; the previous configuration is still
* valid, so return 0. */
if (!per_port_memory && dev->mtu == dev->requested_mtu
&& dev->socket_id == dev->requested_socket_id) {
return ret;
}
dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size));
if (!dmp) {
VLOG_ERR("Failed to create memory pool for netdev "
"%s, with MTU %d on socket %d: %s\n",
dev->up.name, dev->requested_mtu, dev->requested_socket_id,
rte_strerror(rte_errno));
ret = rte_errno;
} else {
/* Check for any pre-existing dpdk_mp for the device before accessing
* the associated mempool.
*/
if (dev->dpdk_mp != NULL) {
/* A new MTU was requested, so decrement the reference count for the
* device's current dpdk_mp. This is required even if a pointer to the
* same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
* has already been incremented by dpdk_mp_get at this stage so it
* must be decremented to keep an accurate refcount for the
* dpdk_mp.
*/
dpdk_mp_put(dev->dpdk_mp);
}
dev->dpdk_mp = dmp;
dev->mtu = dev->requested_mtu;
dev->socket_id = dev->requested_socket_id;
dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
}
return ret;
}
static void
check_link_status(struct netdev_dpdk *dev)
{
struct rte_eth_link link;
if (rte_eth_link_get_nowait(dev->port_id, &link) < 0) {
VLOG_DBG_RL(&rl,
"Failed to retrieve link status for port "DPDK_PORT_ID_FMT,
dev->port_id);
return;
}
if (dev->link.link_status != link.link_status) {
netdev_change_seq_changed(&dev->up);
dev->link_reset_cnt++;
dev->link = link;
if (dev->link.link_status) {
VLOG_DBG_RL(&rl,
"Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
dev->port_id, (unsigned) dev->link.link_speed,
(dev->link.link_duplex == RTE_ETH_LINK_FULL_DUPLEX)
? "full-duplex" : "half-duplex");
} else {
VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
dev->port_id);
}
}
}
static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
struct netdev_dpdk *dev;
pthread_detach(pthread_self());
for (;;) {
ovs_mutex_lock(&dpdk_mutex);
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
ovs_mutex_lock(&dev->mutex);
if (dev->type == DPDK_DEV_ETH) {
check_link_status(dev);
}
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
}
return NULL;
}
static void
netdev_dpdk_update_netdev_flag(struct netdev_dpdk *dev,
enum dpdk_hw_ol_features hw_ol_features,
enum netdev_ol_flags flag)
OVS_REQUIRES(dev->mutex)
{
struct netdev *netdev = &dev->up;
if (dev->hw_ol_features & hw_ol_features) {
netdev->ol_flags |= flag;
} else {
netdev->ol_flags &= ~flag;
}
}
static void
netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_IPV4_CKSUM_OFFLOAD,
NETDEV_TX_OFFLOAD_IPV4_CKSUM);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TCP_CKSUM_OFFLOAD,
NETDEV_TX_OFFLOAD_TCP_CKSUM);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_UDP_CKSUM_OFFLOAD,
NETDEV_TX_OFFLOAD_UDP_CKSUM);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_SCTP_CKSUM_OFFLOAD,
NETDEV_TX_OFFLOAD_SCTP_CKSUM);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
NETDEV_TX_OFFLOAD_TCP_TSO);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD,
NETDEV_TX_VXLAN_TNL_TSO);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GRE_TNL_TSO_OFFLOAD,
NETDEV_TX_GRE_TNL_TSO);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD,
NETDEV_TX_GENEVE_TNL_TSO);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
}
static int
dpdk_eth_dev_port_config(struct netdev_dpdk *dev,
const struct rte_eth_dev_info *info,
int n_rxq, int n_txq)
{
struct rte_eth_conf conf = port_conf;
uint16_t conf_mtu;
int diag = 0;
int i;
/* As of DPDK 17.11.1, a few PMDs require scatter to be explicitly
* enabled to support jumbo RX.
* Scatter is set for the device only after checking for
* scatter support in the device capabilities. */
if (dev->mtu > RTE_ETHER_MTU) {
if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_SCATTER;
}
}
conf.intr_conf.lsc = dev->lsc_interrupt_mode;
if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_CHECKSUM;
}
if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
&& info->rx_offload_capa & RTE_ETH_RX_OFFLOAD_KEEP_CRC) {
conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_KEEP_CRC;
}
if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_TCP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_UDP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_SCTP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
}
if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
}
if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
}
if (dev->hw_ol_features & NETDEV_TX_GRE_TNL_TSO_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO;
}
if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
}
if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
}
/* Limit configured rss hash functions to only those supported
* by the eth device. */
conf.rx_adv_conf.rss_conf.rss_hf &= info->flow_type_rss_offloads;
if (conf.rx_adv_conf.rss_conf.rss_hf == 0) {
conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;
} else {
conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
}
/* A device may report more queues than it makes available (this has
* been observed for Intel xl710, which reserves some of them for
* SRIOV): rte_eth_*_queue_setup will fail if a queue is not
* available. When this happens we can retry the configuration
* and request fewer queues. */
while (n_rxq && n_txq) {
if (diag) {
VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
}
diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
if (diag) {
VLOG_WARN("Interface %s eth_dev setup error %s\n",
dev->up.name, rte_strerror(-diag));
break;
}
diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
if (diag) {
/* A device may not support rte_eth_dev_set_mtu. In this case,
* log a warning for the user and include the device's configured
* MTU value that will be used instead. */
if (-ENOTSUP == diag) {
rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
VLOG_WARN("Interface %s does not support MTU configuration, "
"max packet size supported is %"PRIu16".",
dev->up.name, conf_mtu);
} else {
VLOG_ERR("Interface %s MTU (%d) setup error: %s",
dev->up.name, dev->mtu, rte_strerror(-diag));
break;
}
}
for (i = 0; i < n_txq; i++) {
diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
dev->socket_id, NULL);
if (diag) {
VLOG_INFO("Interface %s unable to setup txq(%d): %s",
dev->up.name, i, rte_strerror(-diag));
break;
}
}
if (i != n_txq) {
/* Retry with fewer tx queues. */
n_txq = i;
continue;
}
for (i = 0; i < n_rxq; i++) {
diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
dev->socket_id, NULL,
dev->dpdk_mp->mp);
if (diag) {
VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
dev->up.name, i, rte_strerror(-diag));
break;
}
}
if (i != n_rxq) {
/* Retry with fewer rx queues. */
n_rxq = i;
continue;
}
dev->up.n_rxq = n_rxq;
dev->up.n_txq = n_txq;
return 0;
}
return diag;
}
static void
dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
{
if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
dev->port_id);
}
}
static void
dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev)
{
uint64_t rx_metadata = 0;
int ret;
if (dev->rx_metadata_delivery_configured) {
return;
}
/* For the fallback offload (non-"transfer" rules). */
rx_metadata |= RTE_ETH_RX_METADATA_USER_MARK;
#ifdef ALLOW_EXPERIMENTAL_API
/* For the tunnel offload. */
rx_metadata |= RTE_ETH_RX_METADATA_TUNNEL_ID;
#endif /* ALLOW_EXPERIMENTAL_API */
ret = rte_eth_rx_metadata_negotiate(dev->port_id, &rx_metadata);
if (ret == 0) {
if (!(rx_metadata & RTE_ETH_RX_METADATA_USER_MARK)) {
VLOG_DBG("%s: The NIC will not provide per-packet USER_MARK",
netdev_get_name(&dev->up));
}
#ifdef ALLOW_EXPERIMENTAL_API
if (!(rx_metadata & RTE_ETH_RX_METADATA_TUNNEL_ID)) {
VLOG_DBG("%s: The NIC will not provide per-packet TUNNEL_ID",
netdev_get_name(&dev->up));
}
#endif /* ALLOW_EXPERIMENTAL_API */
} else {
VLOG(ret == -ENOTSUP ? VLL_DBG : VLL_WARN,
"%s: Cannot negotiate Rx metadata: %s",
netdev_get_name(&dev->up), rte_strerror(-ret));
}
dev->rx_metadata_delivery_configured = true;
}
static int
dpdk_eth_dev_init(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
struct rte_pktmbuf_pool_private *mbp_priv;
struct rte_eth_dev_info info;
struct rte_ether_addr eth_addr;
int diag;
int n_rxq, n_txq;
uint32_t rx_chksm_offload_capa = RTE_ETH_RX_OFFLOAD_UDP_CKSUM |
RTE_ETH_RX_OFFLOAD_TCP_CKSUM |
RTE_ETH_RX_OFFLOAD_IPV4_CKSUM;
if (netdev_is_flow_api_enabled()) {
/*
* Full tunnel offload requires that tunnel ID metadata be
* delivered with "miss" packets from the hardware to the
* PMD. The same goes for megaflow mark metadata which is
* used in MARK + RSS offload scenario.
*
* Request delivery of such metadata.
*/
dpdk_eth_dev_init_rx_metadata(dev);
}
diag = rte_eth_dev_info_get(dev->port_id, &info);
if (diag < 0) {
VLOG_ERR("Interface %s rte_eth_dev_info_get error: %s",
dev->up.name, rte_strerror(-diag));
return -diag;
}
dev->is_representor = !!(*info.dev_flags & RTE_ETH_DEV_REPRESENTOR);
if (strstr(info.driver_name, "vf") != NULL) {
VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
} else {
dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
}
if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
rx_chksm_offload_capa) {
VLOG_WARN("Rx checksum offload is not supported on port "
DPDK_PORT_ID_FMT, dev->port_id);
dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
} else {
dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
}
if (info.rx_offload_capa & RTE_ETH_RX_OFFLOAD_SCATTER) {
dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
} else {
/* Do not warn on lack of scatter support */
dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
}
if (!strcmp(info.driver_name, "net_tap")) {
/* FIXME: L4 checksum offloading is broken in DPDK net/tap driver.
* This workaround can be removed once the fix makes it to a DPDK
* LTS release used by OVS. */
VLOG_INFO("%s: disabled Tx L4 checksum offloads for a net/tap port.",
netdev_get_name(&dev->up));
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_UDP_CKSUM;
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
}
if (!strcmp(info.driver_name, "net_txgbe")) {
/* FIXME: Driver advertises the capability but doesn't seem
* to actually support it correctly. Can remove this once
* the driver is fixed on DPDK side. */
VLOG_INFO("%s: disabled Tx outer udp checksum offloads for a "
"net/txgbe port.",
netdev_get_name(&dev->up));
info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_TCP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_UDP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_UDP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
} else {
dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
}
dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
if (userspace_tso_enabled()) {
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) {
dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
} else {
VLOG_WARN("%s: Tx TSO offload is not supported.",
netdev_get_name(&dev->up));
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
} else {
VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
netdev_get_name(&dev->up));
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
} else {
VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
netdev_get_name(&dev->up));
}
if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO) {
dev->hw_ol_features |= NETDEV_TX_GRE_TNL_TSO_OFFLOAD;
} else {
VLOG_WARN("%s: Tx GRE tunnel TSO offload is not supported.",
netdev_get_name(&dev->up));
}
}
n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
diag = dpdk_eth_dev_port_config(dev, &info, n_rxq, n_txq);
if (diag) {
VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
"configure error: %s",
dev->up.name, n_rxq, n_txq,
dev->lsc_interrupt_mode ? "true" : "false",
rte_strerror(-diag));
return -diag;
}
diag = rte_eth_dev_start(dev->port_id);
if (diag) {
VLOG_ERR("Interface %s start error: %s", dev->up.name,
rte_strerror(-diag));
return -diag;
}
dev->started = true;
netdev_dpdk_configure_xstats(dev);
rte_eth_promiscuous_enable(dev->port_id);
rte_eth_allmulticast_enable(dev->port_id);
memset(&eth_addr, 0x0, sizeof(eth_addr));
rte_eth_macaddr_get(dev->port_id, &eth_addr);
VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
if (rte_eth_link_get_nowait(dev->port_id, &dev->link) < 0) {
memset(&dev->link, 0, sizeof dev->link);
}
mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
return 0;
}
static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
return CONTAINER_OF(netdev, struct netdev_dpdk, up);
}
static struct netdev *
netdev_dpdk_alloc(void)
{
struct netdev_dpdk *dev;
dev = dpdk_rte_mzalloc(sizeof *dev);
if (dev) {
return &dev->up;
}
return NULL;
}
static struct dpdk_tx_queue *
netdev_dpdk_alloc_txq(unsigned int n_txqs)
{
struct dpdk_tx_queue *txqs;
unsigned i;
txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
if (txqs) {
for (i = 0; i < n_txqs; i++) {
/* Initialize map for vhost devices. */
txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
rte_spinlock_init(&txqs[i].tx_lock);
}
}
return txqs;
}
static int
common_construct(struct netdev *netdev, dpdk_port_t port_no,
enum dpdk_dev_type type, int socket_id)
OVS_REQUIRES(dpdk_mutex)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_init(&dev->mutex);
rte_spinlock_init(&dev->stats_lock);
/* If the 'socket_id' is negative, it means that the kernel failed
* to obtain the PCI NUMA info. In that situation, always
* use 'SOCKET0'. */
dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
dev->requested_socket_id = dev->socket_id;
dev->port_id = port_no;
dev->type = type;
dev->flags = 0;
dev->requested_mtu = RTE_ETHER_MTU;
dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
dev->requested_lsc_interrupt_mode = 0;
ovsrcu_index_init(&dev->vid, -1);
dev->vhost_reconfigured = false;
dev->virtio_features_state = OVS_VIRTIO_F_CLEAN;
dev->attached = false;
dev->started = false;
ovsrcu_init(&dev->qos_conf, NULL);
ovsrcu_init(&dev->ingress_policer, NULL);
dev->policer_rate = 0;
dev->policer_burst = 0;
netdev->n_rxq = 0;
netdev->n_txq = 0;
dev->user_n_rxq = NR_QUEUE;
dev->requested_n_rxq = NR_QUEUE;
dev->requested_n_txq = NR_QUEUE;
dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
dev->requested_rx_steer_flags = 0;
dev->rx_steer_flags = 0;
dev->rx_steer_flows_num = 0;
dev->rx_steer_flows = NULL;
/* Initialize the flow control configuration to zero. */
memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
/* Initialize the hardware offload flags to 0. */
dev->hw_ol_features = 0;
dev->rx_metadata_delivery_configured = false;
dev->flags = NETDEV_UP | NETDEV_PROMISC;
ovs_list_push_back(&dpdk_list, &dev->list_node);
netdev_request_reconfigure(netdev);
dev->rte_xstats_names = NULL;
dev->rte_xstats_names_size = 0;
dev->rte_xstats_ids = NULL;
dev->rte_xstats_ids_size = 0;
dev->sw_stats = xzalloc(sizeof *dev->sw_stats);
dev->sw_stats->tx_retries = (dev->type == DPDK_DEV_VHOST) ? 0 : UINT64_MAX;
return 0;
}
static int
vhost_common_construct(struct netdev *netdev)
OVS_REQUIRES(dpdk_mutex)
{
int socket_id = rte_lcore_to_socket_id(rte_get_main_lcore());
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
sizeof *dev->vhost_rxq_enabled);
if (!dev->vhost_rxq_enabled) {
return ENOMEM;
}
dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
if (!dev->tx_q) {
rte_free(dev->vhost_rxq_enabled);
return ENOMEM;
}
atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
dev->vhost_max_queue_pairs = VHOST_MAX_QUEUE_PAIRS_DEF;
return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
DPDK_DEV_VHOST, socket_id);
}
static int
netdev_dpdk_vhost_construct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const char *name = netdev->name;
int err;
/* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
* the file system. '/' or '\' would traverse directories, so they're not
* acceptable in 'name'. */
if (strchr(name, '/') || strchr(name, '\\')) {
VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
"A valid name must not include '/' or '\\'",
name);
return EINVAL;
}
ovs_mutex_lock(&dpdk_mutex);
/* Take the name of the vhost-user port and append it to the location where
* the socket is to be created, then register the socket.
*/
dev->vhost_id = xasprintf("%s/%s", vhost_sock_dir, name);
dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
/* There is no support for multi-segment buffers. */
dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
if (err) {
VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
dev->vhost_id);
goto out;
} else {
fatal_signal_add_file_to_unlink(dev->vhost_id);
VLOG_INFO("Socket %s created for vhost-user port %s\n",
dev->vhost_id, name);
}
err = rte_vhost_driver_callback_register(dev->vhost_id,
&virtio_net_device_ops);
if (err) {
VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
"port: %s\n", name);
goto out;
}
if (!userspace_tso_enabled()) {
err = rte_vhost_driver_disable_features(dev->vhost_id,
1ULL << VIRTIO_NET_F_HOST_TSO4
| 1ULL << VIRTIO_NET_F_HOST_TSO6
| 1ULL << VIRTIO_NET_F_CSUM);
if (err) {
VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
"port: %s\n", name);
goto out;
}
}
err = rte_vhost_driver_start(dev->vhost_id);
if (err) {
VLOG_ERR("rte_vhost_driver_start failed for vhost user "
"port: %s\n", name);
goto out;
}
err = vhost_common_construct(netdev);
if (err) {
VLOG_ERR("vhost_common_construct failed for vhost user "
"port: %s\n", name);
}
out:
if (err) {
free(dev->vhost_id);
dev->vhost_id = NULL;
}
ovs_mutex_unlock(&dpdk_mutex);
VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
"please migrate to dpdkvhostuserclient ports.");
return err;
}
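/* Example usage (illustrative only): a dpdkvhostuser port is added with
* something like
*
*   ovs-vsctl add-port br0 vhost0 -- set Interface vhost0 type=dpdkvhostuser
*
* which creates the socket '<vhost_sock_dir>/vhost0' for a VM to connect
* to. As the warning above notes, new deployments should prefer
* type=dpdkvhostuserclient instead. */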
static int
netdev_dpdk_vhost_client_construct(struct netdev *netdev)
{
int err;
ovs_mutex_lock(&dpdk_mutex);
err = vhost_common_construct(netdev);
if (err) {
VLOG_ERR("vhost_common_construct failed for vhost user client"
"port: %s\n", netdev->name);
}
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
static int
netdev_dpdk_construct(struct netdev *netdev)
{
int err;
ovs_mutex_lock(&dpdk_mutex);
err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
DPDK_DEV_ETH, SOCKET0);
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
static void
common_destruct(struct netdev_dpdk *dev)
OVS_REQUIRES(dpdk_mutex)
OVS_EXCLUDED(dev->mutex)
{
rte_free(dev->tx_q);
dpdk_mp_put(dev->dpdk_mp);
ovs_list_remove(&dev->list_node);
free(ovsrcu_get_protected(struct ingress_policer *,
&dev->ingress_policer));
free(dev->sw_stats);
ovs_mutex_destroy(&dev->mutex);
}
static void dpdk_rx_steer_unconfigure(struct netdev_dpdk *);
static void
netdev_dpdk_destruct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dpdk_mutex);
/* Destroy any rx-steering flows to allow RXQs to be removed. */
dpdk_rx_steer_unconfigure(dev);
rte_eth_dev_stop(dev->port_id);
dev->started = false;
if (dev->attached) {
bool dpdk_resources_still_used = false;
struct rte_eth_dev_info dev_info;
dpdk_port_t sibling_port_id;
int diag;
/* Check if this netdev has siblings (i.e. shares DPDK resources) among
* other OVS netdevs. */
RTE_ETH_FOREACH_DEV_SIBLING (sibling_port_id, dev->port_id) {
struct netdev_dpdk *sibling;
/* RTE_ETH_FOREACH_DEV_SIBLING lists dev->port_id as part of the
* loop. */
if (sibling_port_id == dev->port_id) {
continue;
}
LIST_FOR_EACH (sibling, list_node, &dpdk_list) {
if (sibling->port_id != sibling_port_id) {
continue;
}
dpdk_resources_still_used = true;
break;
}
if (dpdk_resources_still_used) {
break;
}
}
/* Retrieve eth device data before closing it. */
diag = rte_eth_dev_info_get(dev->port_id, &dev_info);
/* Remove the eth device. */
rte_eth_dev_close(dev->port_id);
/* Remove the rte device if no associated eth device is used by OVS.
* Note: any remaining eth devices associated with this rte device are
* closed by the DPDK ethdev layer. */
if (!dpdk_resources_still_used) {
if (!diag) {
diag = rte_dev_remove(dev_info.device);
}
if (diag < 0) {
VLOG_ERR("Device '%s' can not be detached: %s.",
dev->devargs, rte_strerror(-diag));
} else {
/* Device was closed and detached. */
VLOG_INFO("Device '%s' has been removed and detached",
dev->devargs);
}
} else {
/* Device was only closed. rte_dev_remove() was not called. */
VLOG_INFO("Device '%s' has been removed", dev->devargs);
}
}
netdev_dpdk_clear_xstats(dev);
free(dev->devargs);
common_destruct(dev);
ovs_mutex_unlock(&dpdk_mutex);
}
/* rte_vhost_driver_unregister() can call back destroy_device(), which will
* try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
* deadlock, none of the mutexes must be held while calling this function. */
static int
dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
char *vhost_id)
OVS_EXCLUDED(dpdk_mutex)
OVS_EXCLUDED(dev->mutex)
{
return rte_vhost_driver_unregister(vhost_id);
}
static void
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
char *vhost_id;
ovs_mutex_lock(&dpdk_mutex);
/* Guest becomes an orphan if still attached. */
if (netdev_dpdk_get_vid(dev) >= 0
&& !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
VLOG_ERR("Removing port '%s' while vhost device still attached.",
netdev->name);
VLOG_ERR("To restore connectivity after re-adding of port, VM on "
"socket '%s' must be restarted.", dev->vhost_id);
}
vhost_id = dev->vhost_id;
dev->vhost_id = NULL;
rte_free(dev->vhost_rxq_enabled);
common_destruct(dev);
ovs_mutex_unlock(&dpdk_mutex);
if (!vhost_id) {
goto out;
}
if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
netdev->name, vhost_id);
} else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
/* OVS server mode - remove this socket from the list for deletion. */
fatal_signal_remove_file_to_unlink(vhost_id);
}
out:
free(vhost_id);
}
static void
netdev_dpdk_dealloc(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
rte_free(dev);
}
static void
netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
free(dev->rte_xstats_names);
dev->rte_xstats_names = NULL;
dev->rte_xstats_names_size = 0;
free(dev->rte_xstats_ids);
dev->rte_xstats_ids = NULL;
dev->rte_xstats_ids_size = 0;
}
static const char *
netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
OVS_REQUIRES(dev->mutex)
{
if (id >= dev->rte_xstats_names_size) {
return "UNKNOWN";
}
return dev->rte_xstats_names[id].name;
}
static bool
is_queue_stat(const char *s)
{
uint16_t tmp;
return (s[0] == 'r' || s[0] == 't') &&
(ovs_scan(s + 1, "x_q%"SCNu16"_packets", &tmp) ||
ovs_scan(s + 1, "x_q%"SCNu16"_bytes", &tmp));
}
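/* Illustrative only: is_queue_stat() matches per-queue basic counters such
* as "rx_q3_packets" or "tx_q0_bytes", but not device-wide names such as
* "rx_errors". */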
static void
netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
struct rte_eth_xstat_name *rte_xstats_names = NULL;
struct rte_eth_xstat *rte_xstats = NULL;
int rte_xstats_names_size;
int rte_xstats_len;
const char *name;
uint64_t id;
netdev_dpdk_clear_xstats(dev);
rte_xstats_names_size = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
if (rte_xstats_names_size < 0) {
VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
dev->port_id);
goto out;
}
rte_xstats_names = xcalloc(rte_xstats_names_size,
sizeof *rte_xstats_names);
rte_xstats_len = rte_eth_xstats_get_names(dev->port_id,
rte_xstats_names,
rte_xstats_names_size);
if (rte_xstats_len < 0 || rte_xstats_len != rte_xstats_names_size) {
VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
dev->port_id);
goto out;
}
rte_xstats = xcalloc(rte_xstats_names_size, sizeof *rte_xstats);
rte_xstats_len = rte_eth_xstats_get(dev->port_id, rte_xstats,
rte_xstats_names_size);
if (rte_xstats_len < 0 || rte_xstats_len != rte_xstats_names_size) {
VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
dev->port_id);
goto out;
}
dev->rte_xstats_names = rte_xstats_names;
rte_xstats_names = NULL;
dev->rte_xstats_names_size = rte_xstats_names_size;
dev->rte_xstats_ids = xcalloc(rte_xstats_names_size,
sizeof *dev->rte_xstats_ids);
for (unsigned int i = 0; i < rte_xstats_names_size; i++) {
id = rte_xstats[i].id;
name = netdev_dpdk_get_xstat_name(dev, id);
/* For custom stats, we filter out everything except per rxq/txq basic
* stats, and dropped, error and management counters. */
if (is_queue_stat(name) ||
string_ends_with(name, "_errors") ||
strstr(name, "_management_") ||
string_ends_with(name, "_dropped")) {
dev->rte_xstats_ids[dev->rte_xstats_ids_size] = id;
dev->rte_xstats_ids_size++;
}
}
out:
free(rte_xstats);
free(rte_xstats_names);
}
static int
netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
if (dev->devargs && dev->devargs[0]) {
smap_add_format(args, "dpdk-devargs", "%s", dev->devargs);
}
smap_add_format(args, "n_rxq", "%d", dev->user_n_rxq);
if (dev->fc_conf.mode == RTE_ETH_FC_TX_PAUSE ||
dev->fc_conf.mode == RTE_ETH_FC_FULL) {
smap_add(args, "rx-flow-ctrl", "true");
}
if (dev->fc_conf.mode == RTE_ETH_FC_RX_PAUSE ||
dev->fc_conf.mode == RTE_ETH_FC_FULL) {
smap_add(args, "tx-flow-ctrl", "true");
}
if (dev->fc_conf.autoneg) {
smap_add(args, "flow-ctrl-autoneg", "true");
}
smap_add_format(args, "n_rxq_desc", "%d", dev->rxq_size);
smap_add_format(args, "n_txq_desc", "%d", dev->txq_size);
if (dev->rx_steer_flags == DPDK_RX_STEER_LACP) {
smap_add(args, "rx-steering", "rss+lacp");
}
smap_add(args, "dpdk-lsc-interrupt",
dev->lsc_interrupt_mode ? "true" : "false");
if (dev->is_representor) {
smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT,
ETH_ADDR_ARGS(dev->requested_hwaddr));
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static struct netdev_dpdk *
netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
OVS_REQUIRES(dpdk_mutex)
{
struct netdev_dpdk *dev;
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
if (dev->port_id == port_id) {
return dev;
}
}
return NULL;
}
static dpdk_port_t
netdev_dpdk_get_port_by_mac(const char *mac_str)
{
dpdk_port_t port_id;
struct eth_addr mac, port_mac;
if (!eth_addr_from_string(mac_str, &mac)) {
VLOG_ERR("invalid mac: %s", mac_str);
return DPDK_ETH_PORT_ID_INVALID;
}
RTE_ETH_FOREACH_DEV (port_id) {
struct rte_ether_addr ea;
rte_eth_macaddr_get(port_id, &ea);
memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
if (eth_addr_equals(mac, port_mac)) {
return port_id;
}
}
return DPDK_ETH_PORT_ID_INVALID;
}
/* Return the first DPDK port id matching the devargs pattern. */
static dpdk_port_t netdev_dpdk_get_port_by_devargs(const char *devargs)
OVS_REQUIRES(dpdk_mutex)
{
dpdk_port_t port_id;
struct rte_dev_iterator iterator;
RTE_ETH_FOREACH_MATCHING_DEV (port_id, devargs, &iterator) {
/* If a break is done - must call rte_eth_iterator_cleanup. */
rte_eth_iterator_cleanup(&iterator);
break;
}
return port_id;
}
/*
* Normally, a PCI id (optionally followed by a representor identifier)
* is enough for identifying a specific DPDK port.
 * However, some NICs have multiple ports that share the same PCI id, in
 * which case the PCI id alone is not enough to identify a specific port.
*
* To fix that, here one more method is introduced: "class=eth,mac=$MAC".
*
* Note that the compatibility is fully kept: user can still use the
* PCI id for adding ports (when it's enough for them).
*/
static dpdk_port_t
netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
const char *devargs, char **errp)
OVS_REQUIRES(dpdk_mutex)
{
dpdk_port_t new_port_id;
if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
} else {
new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
if (!rte_eth_dev_is_valid_port(new_port_id)) {
/* Device not found in DPDK, attempt to attach it */
if (rte_dev_probe(devargs)) {
new_port_id = DPDK_ETH_PORT_ID_INVALID;
} else {
new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
if (rte_eth_dev_is_valid_port(new_port_id)) {
/* Attach successful */
dev->attached = true;
VLOG_INFO("Device '%s' attached to DPDK", devargs);
} else {
/* Attach unsuccessful */
new_port_id = DPDK_ETH_PORT_ID_INVALID;
}
}
}
}
if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
}
return new_port_id;
}
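/* Device reset handling.  The interrupt callback below only records the
 * event and changes a sequence number; netdev_dpdk_run() later requests a
 * reconfiguration of the affected ports. */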
static struct seq *netdev_dpdk_reset_seq;
static uint64_t netdev_dpdk_last_reset_seq;
static atomic_bool netdev_dpdk_pending_reset[RTE_MAX_ETHPORTS];
static void
netdev_dpdk_wait(const struct netdev_class *netdev_class OVS_UNUSED)
{
uint64_t last_reset_seq = seq_read(netdev_dpdk_reset_seq);
if (netdev_dpdk_last_reset_seq == last_reset_seq) {
seq_wait(netdev_dpdk_reset_seq, netdev_dpdk_last_reset_seq);
} else {
poll_immediate_wake();
}
}
static void
netdev_dpdk_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
uint64_t reset_seq = seq_read(netdev_dpdk_reset_seq);
if (reset_seq != netdev_dpdk_last_reset_seq) {
dpdk_port_t port_id;
netdev_dpdk_last_reset_seq = reset_seq;
for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
struct netdev_dpdk *dev;
bool pending_reset;
atomic_read_relaxed(&netdev_dpdk_pending_reset[port_id],
&pending_reset);
if (!pending_reset) {
continue;
}
ovs_mutex_lock(&dpdk_mutex);
dev = netdev_dpdk_lookup_by_port_id(port_id);
if (dev) {
ovs_mutex_lock(&dev->mutex);
netdev_request_reconfigure(&dev->up);
VLOG_DBG_RL(&rl, "%s: Device reset requested.",
netdev_get_name(&dev->up));
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
}
}
}
static int
dpdk_eth_event_callback(dpdk_port_t port_id, enum rte_eth_event_type type,
void *param OVS_UNUSED, void *ret_param OVS_UNUSED)
{
switch ((int) type) {
case RTE_ETH_EVENT_INTR_RESET:
atomic_store_relaxed(&netdev_dpdk_pending_reset[port_id], true);
seq_change(netdev_dpdk_reset_seq);
break;
default:
/* Ignore all other types. */
break;
}
return 0;
}
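/* Applies the 'n_rxq' option and requests a reconfiguration if the
 * requested number of rx queues changed. */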
static void
dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
OVS_REQUIRES(dev->mutex)
{
int new_n_rxq;
new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
if (new_n_rxq != dev->user_n_rxq) {
dev->user_n_rxq = new_n_rxq;
netdev_request_reconfigure(&dev->up);
}
}
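/* Parses the 'n_rxq_desc' or 'n_txq_desc' option, clamps the value to OVS
 * and device descriptor limits, and requests a reconfiguration when the
 * resulting ring size differs from the current one. */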
static void
dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
struct rte_eth_dev_info *info, bool is_rx)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_desc_lim *lim;
int default_size, queue_size, cur_size, new_requested_size;
int *cur_requested_size;
bool reconfig = false;
if (is_rx) {
default_size = NIC_PORT_DEFAULT_RXQ_SIZE;
new_requested_size = smap_get_int(args, "n_rxq_desc", default_size);
cur_size = dev->rxq_size;
cur_requested_size = &dev->requested_rxq_size;
lim = info ? &info->rx_desc_lim : NULL;
} else {
default_size = NIC_PORT_DEFAULT_TXQ_SIZE;
new_requested_size = smap_get_int(args, "n_txq_desc", default_size);
cur_size = dev->txq_size;
cur_requested_size = &dev->requested_txq_size;
lim = info ? &info->tx_desc_lim : NULL;
}
queue_size = new_requested_size;
/* Check for OVS limits. */
if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
|| !is_pow2(queue_size)) {
queue_size = default_size;
}
if (lim) {
/* Check for device limits. */
if (lim->nb_align) {
queue_size = ROUND_UP(queue_size, lim->nb_align);
}
queue_size = MIN(queue_size, lim->nb_max);
queue_size = MAX(queue_size, lim->nb_min);
}
*cur_requested_size = queue_size;
if (cur_size != queue_size) {
netdev_request_reconfigure(netdev);
reconfig = true;
}
if (new_requested_size != queue_size) {
VLOG(reconfig ? VLL_INFO : VLL_DBG,
"%s: Unable to set the number of %s descriptors to %d. "
"Adjusted to %d.", netdev_get_name(netdev),
is_rx ? "rx": "tx", new_requested_size, queue_size);
}
}
static void
dpdk_set_rx_steer_config(struct netdev *netdev, struct netdev_dpdk *dev,
const struct smap *args, char **errp)
{
const char *arg = smap_get_def(args, "rx-steering", "rss");
uint64_t flags = 0;
if (!strcmp(arg, "rss+lacp")) {
flags = DPDK_RX_STEER_LACP;
} else if (strcmp(arg, "rss")) {
VLOG_WARN_BUF(errp, "%s: options:rx-steering "
"unsupported parameter value '%s'",
netdev_get_name(netdev), arg);
}
if (flags && dev->type != DPDK_DEV_ETH) {
VLOG_WARN_BUF(errp, "%s: options:rx-steering "
"is only supported on ethernet ports",
netdev_get_name(netdev));
flags = 0;
}
if (flags && netdev_is_flow_api_enabled()) {
VLOG_WARN_BUF(errp, "%s: options:rx-steering "
"is incompatible with hw-offload",
netdev_get_name(netdev));
flags = 0;
}
if (flags != dev->requested_rx_steer_flags) {
dev->requested_rx_steer_flags = flags;
netdev_request_reconfigure(netdev);
}
}
static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
char **errp)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
bool flow_control_requested = true;
enum rte_eth_fc_mode fc_mode;
static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
{RTE_ETH_FC_NONE, RTE_ETH_FC_TX_PAUSE},
{RTE_ETH_FC_RX_PAUSE, RTE_ETH_FC_FULL }
};
struct rte_eth_dev_info info;
const char *new_devargs;
const char *vf_mac;
int err = 0;
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
dpdk_set_rx_steer_config(netdev, dev, args, errp);
dpdk_set_rxq_config(dev, args);
new_devargs = smap_get(args, "dpdk-devargs");
if (dev->devargs && new_devargs && strcmp(new_devargs, dev->devargs)) {
/* The user requested a new device. If we return error, the caller
* will delete this netdev and try to recreate it. */
err = EAGAIN;
goto out;
}
/* dpdk-devargs is required for device configuration */
if (new_devargs && new_devargs[0]) {
/* Don't process dpdk-devargs if value is unchanged and port id
* is valid */
if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
&& rte_eth_dev_is_valid_port(dev->port_id))) {
dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
new_devargs,
errp);
if (!rte_eth_dev_is_valid_port(new_port_id)) {
err = EINVAL;
} else if (new_port_id == dev->port_id) {
/* Already configured, do not reconfigure again */
err = 0;
} else {
struct netdev_dpdk *dup_dev;
dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
if (dup_dev) {
VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
"which is already in use by '%s'",
netdev_get_name(netdev), new_devargs,
netdev_get_name(&dup_dev->up));
err = EADDRINUSE;
} else {
int sid = rte_eth_dev_socket_id(new_port_id);
dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
dev->devargs = xstrdup(new_devargs);
dev->port_id = new_port_id;
netdev_request_reconfigure(&dev->up);
err = 0;
}
}
}
} else {
VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
"The old 'dpdk<port_id>' names are not supported",
netdev_get_name(netdev));
err = EINVAL;
}
if (err) {
goto out;
}
err = -rte_eth_dev_info_get(dev->port_id, &info);
if (err) {
VLOG_WARN_BUF(errp, "%s: Failed to get device info: %s" ,
netdev_get_name(netdev), rte_strerror(err));
goto out;
}
dpdk_process_queue_size(netdev, args, &info, true);
dpdk_process_queue_size(netdev, args, &info, false);
vf_mac = smap_get(args, "dpdk-vf-mac");
if (vf_mac) {
struct eth_addr mac;
if (!dev->is_representor) {
VLOG_WARN("'%s' is trying to set the VF MAC '%s' "
"but 'options:dpdk-vf-mac' is only supported for "
"VF representors.",
netdev_get_name(netdev), vf_mac);
} else if (!eth_addr_from_string(vf_mac, &mac)) {
VLOG_WARN("interface '%s': cannot parse VF MAC '%s'.",
netdev_get_name(netdev), vf_mac);
} else if (eth_addr_is_multicast(mac)) {
VLOG_WARN("interface '%s': cannot set VF MAC to multicast "
"address '%s'.", netdev_get_name(netdev), vf_mac);
} else if (!eth_addr_equals(dev->requested_hwaddr, mac)) {
dev->requested_hwaddr = mac;
netdev_request_reconfigure(netdev);
}
}
lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", true);
if (lsc_interrupt_mode && !(*info.dev_flags & RTE_ETH_DEV_INTR_LSC)) {
if (smap_get(args, "dpdk-lsc-interrupt")) {
VLOG_WARN_BUF(errp, "'%s': link status interrupt is not "
"supported.", netdev_get_name(netdev));
err = EINVAL;
goto out;
}
VLOG_DBG("'%s': not enabling link status interrupt.",
netdev_get_name(netdev));
lsc_interrupt_mode = false;
}
if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
netdev_request_reconfigure(netdev);
}
rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
if (!smap_get(args, "rx-flow-ctrl") && !smap_get(args, "tx-flow-ctrl")
&& !smap_get(args, "flow-ctrl-autoneg")) {
/* FIXME: User didn't ask for flow control configuration.
* For now we'll not print a warning if flow control is not
* supported by the DPDK port. */
flow_control_requested = false;
}
/* Get the Flow control configuration. */
err = -rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
if (err) {
if (err == ENOTSUP) {
if (flow_control_requested) {
VLOG_WARN("%s: Flow control is not supported.",
netdev_get_name(netdev));
}
err = 0; /* Not fatal. */
} else {
VLOG_WARN_BUF(errp, "%s: Cannot get flow control parameters: %s",
netdev_get_name(netdev), rte_strerror(err));
}
goto out;
}
if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
dev->fc_conf.mode = fc_mode;
dev->fc_conf.autoneg = autoneg;
dpdk_eth_flow_ctrl_setup(dev);
}
out:
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
return err;
}
static int
netdev_dpdk_vhost_client_get_config(const struct netdev *netdev,
struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int tx_retries_max;
ovs_mutex_lock(&dev->mutex);
if (dev->vhost_id) {
smap_add(args, "vhost-server-path", dev->vhost_id);
}
atomic_read_relaxed(&dev->vhost_tx_retries_max, &tx_retries_max);
if (tx_retries_max != VHOST_ENQ_RETRY_DEF) {
smap_add_format(args, "tx-retries-max", "%d", tx_retries_max);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
const struct smap *args,
char **errp OVS_UNUSED)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const char *path;
int max_tx_retries, cur_max_tx_retries;
uint32_t max_queue_pairs;
ovs_mutex_lock(&dev->mutex);
if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
path = smap_get(args, "vhost-server-path");
if (!nullable_string_is_equal(path, dev->vhost_id)) {
free(dev->vhost_id);
dev->vhost_id = nullable_xstrdup(path);
max_queue_pairs = smap_get_int(args, "vhost-max-queue-pairs",
VHOST_MAX_QUEUE_PAIRS_DEF);
if (max_queue_pairs < VHOST_MAX_QUEUE_PAIRS_MIN
|| max_queue_pairs > VHOST_MAX_QUEUE_PAIRS_MAX) {
max_queue_pairs = VHOST_MAX_QUEUE_PAIRS_DEF;
}
dev->vhost_max_queue_pairs = max_queue_pairs;
netdev_request_reconfigure(netdev);
}
}
max_tx_retries = smap_get_int(args, "tx-retries-max",
VHOST_ENQ_RETRY_DEF);
if (max_tx_retries < VHOST_ENQ_RETRY_MIN
|| max_tx_retries > VHOST_ENQ_RETRY_MAX) {
max_tx_retries = VHOST_ENQ_RETRY_DEF;
}
atomic_read_relaxed(&dev->vhost_tx_retries_max, &cur_max_tx_retries);
if (max_tx_retries != cur_max_tx_retries) {
atomic_store_relaxed(&dev->vhost_tx_retries_max, max_tx_retries);
VLOG_INFO("Max Tx retries for vhost device '%s' set to %d",
netdev_get_name(netdev), max_tx_retries);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_get_numa_id(const struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
return dev->socket_id;
}
/* Sets the number of tx queues for the dpdk interface. */
static int
netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
if (dev->requested_n_txq == n_txq) {
goto out;
}
dev->requested_n_txq = n_txq;
netdev_request_reconfigure(netdev);
out:
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
if (rx) {
return &rx->up;
}
return NULL;
}
static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
{
return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
}
static int
netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
{
struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
ovs_mutex_lock(&dev->mutex);
rx->port_id = dev->port_id;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static void
netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
{
}
static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
{
struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
rte_free(rx);
}
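/* Initializes OVS packet metadata (packet type, hash/mark validity and Rx
 * checksum state) from the Rx offload flags of each mbuf in 'batch'. */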
static inline void
netdev_dpdk_batch_init_packet_fields(struct dp_packet_batch *batch)
{
struct dp_packet *packet;
DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
dp_packet_reset_cutlen(packet);
packet->packet_type = htonl(PT_ETH);
packet->has_hash = !!(packet->mbuf.ol_flags & RTE_MBUF_F_RX_RSS_HASH);
packet->has_mark = !!(packet->mbuf.ol_flags & RTE_MBUF_F_RX_FDIR_ID);
packet->offloads =
packet->mbuf.ol_flags & (RTE_MBUF_F_RX_IP_CKSUM_BAD
| RTE_MBUF_F_RX_IP_CKSUM_GOOD
| RTE_MBUF_F_RX_L4_CKSUM_BAD
| RTE_MBUF_F_RX_L4_CKSUM_GOOD);
}
}
/* Prepares the packet for HWOL.
 * Returns true if the packet is OK to continue. */
static bool
netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
{
struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
uint64_t unexpected = mbuf->ol_flags & RTE_MBUF_F_TX_OFFLOAD_MASK;
const struct ip_header *ip;
bool is_sctp;
bool l3_csum;
bool l4_csum;
bool is_tcp;
bool is_udp;
void *l2;
void *l3;
void *l4;
if (OVS_UNLIKELY(unexpected)) {
VLOG_WARN_RL(&rl, "%s: Unexpected Tx offload flags: %#"PRIx64,
netdev_get_name(&dev->up), unexpected);
netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up),
"Packet with unexpected ol_flags", mbuf);
return false;
}
if (!dp_packet_ip_checksum_partial(pkt)
&& !dp_packet_inner_ip_checksum_partial(pkt)
&& !dp_packet_l4_checksum_partial(pkt)
&& !dp_packet_inner_l4_checksum_partial(pkt)
&& !mbuf->tso_segsz) {
return true;
}
if (dp_packet_tunnel(pkt)
&& (dp_packet_inner_ip_checksum_partial(pkt)
|| dp_packet_inner_l4_checksum_partial(pkt)
|| mbuf->tso_segsz)) {
if (dp_packet_ip_checksum_partial(pkt)
|| dp_packet_l4_checksum_partial(pkt)) {
mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
(char *) dp_packet_eth(pkt);
mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
(char *) dp_packet_l3(pkt);
if (dp_packet_tunnel_geneve(pkt)) {
mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GENEVE;
} else if (dp_packet_tunnel_vxlan(pkt)) {
mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_VXLAN;
} else {
ovs_assert(dp_packet_tunnel_gre(pkt));
mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GRE;
}
if (dp_packet_ip_checksum_partial(pkt)) {
mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_IP_CKSUM;
}
if (dp_packet_l4_checksum_partial(pkt)) {
ovs_assert(dp_packet_l4_proto_udp(pkt));
mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_UDP_CKSUM;
}
ip = dp_packet_l3(pkt);
mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
? RTE_MBUF_F_TX_OUTER_IPV4
: RTE_MBUF_F_TX_OUTER_IPV6;
/* Inner L2 length must account for the tunnel header length. */
l2 = dp_packet_l4(pkt);
l3 = dp_packet_inner_l3(pkt);
l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
l4 = dp_packet_inner_l4(pkt);
l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
is_udp = dp_packet_inner_l4_proto_udp(pkt);
is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
} else {
mbuf->outer_l2_len = 0;
mbuf->outer_l3_len = 0;
/* Skip outer headers. */
l2 = dp_packet_eth(pkt);
l3 = dp_packet_inner_l3(pkt);
l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
l4 = dp_packet_inner_l4(pkt);
l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
is_udp = dp_packet_inner_l4_proto_udp(pkt);
is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
}
} else {
mbuf->outer_l2_len = 0;
mbuf->outer_l3_len = 0;
l2 = dp_packet_eth(pkt);
l3 = dp_packet_l3(pkt);
l3_csum = dp_packet_ip_checksum_partial(pkt);
l4 = dp_packet_l4(pkt);
l4_csum = dp_packet_l4_checksum_partial(pkt);
is_tcp = dp_packet_l4_proto_tcp(pkt);
is_udp = dp_packet_l4_proto_udp(pkt);
is_sctp = dp_packet_l4_proto_sctp(pkt);
}
ovs_assert(l4);
ip = l3;
mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
? RTE_MBUF_F_TX_IPV4 : RTE_MBUF_F_TX_IPV6;
if (l3_csum) {
mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
}
if (l4_csum) {
if (is_tcp) {
mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
} else if (is_udp) {
mbuf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
} else {
ovs_assert(is_sctp);
mbuf->ol_flags |= RTE_MBUF_F_TX_SCTP_CKSUM;
}
}
mbuf->l2_len = (char *) l3 - (char *) l2;
mbuf->l3_len = (char *) l4 - (char *) l3;
if (mbuf->tso_segsz) {
struct tcp_header *th = l4;
uint16_t link_tso_segsz;
int hdr_len;
mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
if (dp_packet_tunnel(pkt)) {
link_tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len -
mbuf->l4_len - mbuf->outer_l3_len;
} else {
link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
}
if (mbuf->tso_segsz > link_tso_segsz) {
mbuf->tso_segsz = link_tso_segsz;
}
hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr: %"PRIu32", "
"gso: %"PRIu32", max len: %"PRIu32"",
dev->up.name, hdr_len, mbuf->tso_segsz,
dev->max_packet_len);
return false;
}
mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
}
    /* If an L4 checksum is requested for an IPv4 packet, the IP checksum
     * must be requested as well. */
if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK
&& mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
}
return true;
}
/* Prepare a batch for HWOL.
* Return the number of good packets in the batch. */
static int
netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
int pkt_cnt)
{
int i = 0;
int cnt = 0;
struct rte_mbuf *pkt;
/* Prepare and filter bad HWOL packets. */
for (i = 0; i < pkt_cnt; i++) {
pkt = pkts[i];
if (!netdev_dpdk_prep_hwol_packet(dev, pkt)) {
rte_pktmbuf_free(pkt);
continue;
}
if (OVS_UNLIKELY(i != cnt)) {
pkts[cnt] = pkt;
}
cnt++;
}
return cnt;
}
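/* Dumps the contents of 'mbuf' to the debug log, rate limited. */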
static void
netdev_dpdk_mbuf_dump(const char *prefix, const char *message,
const struct rte_mbuf *mbuf)
{
static struct vlog_rate_limit dump_rl = VLOG_RATE_LIMIT_INIT(5, 5);
char *response = NULL;
FILE *stream;
size_t size;
if (VLOG_DROP_DBG(&dump_rl)) {
return;
}
stream = open_memstream(&response, &size);
if (!stream) {
VLOG_ERR("Unable to open memstream for mbuf dump: %s.",
ovs_strerror(errno));
return;
}
rte_pktmbuf_dump(stream, mbuf, rte_pktmbuf_pkt_len(mbuf));
fclose(stream);
VLOG_DBG(prefix ? "%s: %s:\n%s" : "%s%s:\n%s",
prefix ? prefix : "", message, response);
free(response);
}
/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
* 'pkts', even in case of failure.
*
* Returns the number of packets that weren't transmitted. */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
struct rte_mbuf **pkts, int cnt)
{
uint32_t nb_tx = 0;
uint16_t nb_tx_prep = cnt;
nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
if (nb_tx_prep != cnt) {
VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
"Only %u/%u are valid: %s", netdev_get_name(&dev->up),
nb_tx_prep, cnt, rte_strerror(rte_errno));
netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up),
"First invalid packet", pkts[nb_tx_prep]);
}
while (nb_tx != nb_tx_prep) {
uint32_t ret;
ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx,
nb_tx_prep - nb_tx);
if (!ret) {
break;
}
nb_tx += ret;
}
if (OVS_UNLIKELY(nb_tx != cnt)) {
/* Free buffers, which we couldn't transmit. */
rte_pktmbuf_free_bulk(&pkts[nb_tx], cnt - nb_tx);
}
return cnt - nb_tx;
}
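/* Returns true if the srTCM policer marks the packet green, i.e. it fits
 * within the configured rate.  The Ethernet header is not counted against
 * the meter. */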
static inline bool
netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
struct rte_meter_srtcm_profile *profile,
struct rte_mbuf *pkt, uint64_t time)
{
uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);
return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
RTE_COLOR_GREEN;
}
static int
srtcm_policer_run_single_packet(struct rte_meter_srtcm *meter,
struct rte_meter_srtcm_profile *profile,
struct rte_mbuf **pkts, int pkt_cnt,
bool should_steal)
{
int i = 0;
int cnt = 0;
struct rte_mbuf *pkt = NULL;
uint64_t current_time = rte_rdtsc();
for (i = 0; i < pkt_cnt; i++) {
pkt = pkts[i];
/* Handle current packet */
if (netdev_dpdk_srtcm_policer_pkt_handle(meter, profile,
pkt, current_time)) {
if (cnt != i) {
pkts[cnt] = pkt;
}
cnt++;
} else {
if (should_steal) {
rte_pktmbuf_free(pkt);
}
}
}
return cnt;
}
static int
ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
int pkt_cnt, bool should_steal)
{
int cnt = 0;
rte_spinlock_lock(&policer->policer_lock);
cnt = srtcm_policer_run_single_packet(&policer->in_policer,
&policer->in_prof,
pkts, pkt_cnt, should_steal);
rte_spinlock_unlock(&policer->policer_lock);
return cnt;
}
static bool
is_vhost_running(struct netdev_dpdk *dev)
{
return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
}
/*
* The receive path for the vhost port is the TX path out from guest.
*/
static int
netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
struct dp_packet_batch *batch, int *qfill)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
uint16_t nb_rx = 0;
uint16_t qos_drops = 0;
int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
int vid = netdev_dpdk_get_vid(dev);
if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
|| !(dev->flags & NETDEV_UP))) {
return EAGAIN;
}
nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
(struct rte_mbuf **) batch->packets,
NETDEV_MAX_BURST);
if (!nb_rx) {
return EAGAIN;
}
if (qfill) {
if (nb_rx == NETDEV_MAX_BURST) {
/* The DPDK API returns a uint32_t which often has invalid bits in
* the upper 16-bits. Need to restrict the value to uint16_t. */
*qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
} else {
*qfill = 0;
}
}
if (policer) {
qos_drops = nb_rx;
nb_rx = ingress_policer_run(policer,
(struct rte_mbuf **) batch->packets,
nb_rx, true);
qos_drops -= nb_rx;
}
if (OVS_UNLIKELY(qos_drops)) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.rx_dropped += qos_drops;
dev->sw_stats->rx_qos_drops += qos_drops;
rte_spinlock_unlock(&dev->stats_lock);
}
batch->count = nb_rx;
netdev_dpdk_batch_init_packet_fields(batch);
return 0;
}
static bool
netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
return dev->vhost_rxq_enabled[rxq->queue_id];
}
static int
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
int *qfill)
{
struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
int nb_rx;
int dropped = 0;
if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
return EAGAIN;
}
nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
(struct rte_mbuf **) batch->packets,
NETDEV_MAX_BURST);
if (!nb_rx) {
return EAGAIN;
}
if (policer) {
dropped = nb_rx;
nb_rx = ingress_policer_run(policer,
(struct rte_mbuf **) batch->packets,
nb_rx, true);
dropped -= nb_rx;
}
/* Update stats to reflect dropped packets */
if (OVS_UNLIKELY(dropped)) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.rx_dropped += dropped;
dev->sw_stats->rx_qos_drops += dropped;
rte_spinlock_unlock(&dev->stats_lock);
}
batch->count = nb_rx;
netdev_dpdk_batch_init_packet_fields(batch);
if (qfill) {
if (nb_rx == NETDEV_MAX_BURST) {
*qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
} else {
*qfill = 0;
}
}
return 0;
}
static inline int
netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
int cnt, bool should_steal)
{
struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
if (qos_conf) {
rte_spinlock_lock(&qos_conf->lock);
cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
rte_spinlock_unlock(&qos_conf->lock);
}
return cnt;
}
static int
netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
int pkt_cnt)
{
int i = 0;
int cnt = 0;
struct rte_mbuf *pkt;
/* Filter oversized packets. The TSO packets are filtered out
* during the offloading preparation for performance reasons. */
for (i = 0; i < pkt_cnt; i++) {
pkt = pkts[i];
if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len)
&& !pkt->tso_segsz)) {
VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " "
"max_packet_len %d", dev->up.name, pkt->pkt_len,
dev->max_packet_len);
rte_pktmbuf_free(pkt);
continue;
}
if (OVS_UNLIKELY(i != cnt)) {
pkts[cnt] = pkt;
}
cnt++;
}
return cnt;
}
static void
netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
{
rte_free(opaque);
}
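/* Allocates an external buffer large enough for 'data_len' bytes plus
 * headroom and attaches it to 'pkt'.  Used when the mbuf's own buffer is
 * too small for the packet being copied. */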
static struct rte_mbuf *
dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
{
uint32_t total_len = RTE_PKTMBUF_HEADROOM + data_len;
struct rte_mbuf_ext_shared_info *shinfo = NULL;
uint16_t buf_len;
void *buf;
total_len += sizeof *shinfo + sizeof(uintptr_t);
total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
VLOG_ERR("Can't copy packet: too big %u", total_len);
return NULL;
}
buf_len = total_len;
buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
if (OVS_UNLIKELY(buf == NULL)) {
VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
return NULL;
}
/* Initialize shinfo. */
shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
netdev_dpdk_extbuf_free,
buf);
if (OVS_UNLIKELY(shinfo == NULL)) {
rte_free(buf);
VLOG_ERR("Failed to initialize shared info for mbuf while "
"attempting to attach an external buffer.");
return NULL;
}
rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
shinfo);
rte_pktmbuf_reset_headroom(pkt);
return pkt;
}
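/* Allocates an mbuf from 'mp', attaching an external buffer when
 * 'data_len' does not fit in the mbuf's tailroom.  Returns NULL on
 * failure. */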
static struct rte_mbuf *
dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
{
struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
if (OVS_UNLIKELY(!pkt)) {
return NULL;
}
if (rte_pktmbuf_tailroom(pkt) >= data_len) {
return pkt;
}
if (dpdk_pktmbuf_attach_extbuf(pkt, data_len)) {
return pkt;
}
rte_pktmbuf_free(pkt);
return NULL;
}
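/* Copies 'pkt_orig' into a new DPDK-backed dp_packet allocated from 'mp',
 * preserving Tx offload metadata and header offsets.  Returns NULL if no
 * suitable mbuf could be allocated. */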
static struct dp_packet *
dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
{
struct rte_mbuf *mbuf_dest;
struct dp_packet *pkt_dest;
uint32_t pkt_len;
pkt_len = dp_packet_size(pkt_orig);
mbuf_dest = dpdk_pktmbuf_alloc(mp, pkt_len);
if (OVS_UNLIKELY(mbuf_dest == NULL)) {
return NULL;
}
pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
memcpy(dp_packet_data(pkt_dest), dp_packet_data(pkt_orig), pkt_len);
dp_packet_set_size(pkt_dest, pkt_len);
mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT));
mbuf_dest->tso_segsz = pkt_orig->mbuf.tso_segsz;
memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
if (dp_packet_l3(pkt_dest)) {
if (dp_packet_eth(pkt_dest)) {
mbuf_dest->l2_len = (char *) dp_packet_l3(pkt_dest)
- (char *) dp_packet_eth(pkt_dest);
} else {
mbuf_dest->l2_len = 0;
}
if (dp_packet_l4(pkt_dest)) {
mbuf_dest->l3_len = (char *) dp_packet_l4(pkt_dest)
- (char *) dp_packet_l3(pkt_dest);
} else {
mbuf_dest->l3_len = 0;
}
}
return pkt_dest;
}
/* Replace packets in a 'batch' with their corresponding copies using
* DPDK memory.
*
* Returns the number of good packets in the batch. */
static size_t
dpdk_copy_batch_to_mbuf(struct netdev *netdev, struct dp_packet_batch *batch)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
size_t i, size = dp_packet_batch_size(batch);
struct dp_packet *packet;
DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
if (OVS_UNLIKELY(packet->source == DPBUF_DPDK)) {
dp_packet_batch_refill(batch, packet, i);
} else {
struct dp_packet *pktcopy;
pktcopy = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
if (pktcopy) {
dp_packet_batch_refill(batch, pktcopy, i);
}
dp_packet_delete(packet);
}
}
return dp_packet_batch_size(batch);
}
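/* Common Tx preparation shared by the ETH and vhost send paths: copies
 * non-DPDK packets into mbufs, drops oversized packets, prepares hardware
 * offloads and applies egress QoS.  Per-step drop counts are reported in
 * 'stats'.  Returns the number of packets left in the batch. */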
static size_t
netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch,
struct netdev_dpdk_sw_stats *stats)
{
struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
size_t cnt, pkt_cnt = dp_packet_batch_size(batch);
struct dp_packet *packet;
bool need_copy = false;
memset(stats, 0, sizeof *stats);
DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
if (packet->source != DPBUF_DPDK) {
need_copy = true;
break;
}
}
/* Copy dp-packets to mbufs. */
if (OVS_UNLIKELY(need_copy)) {
cnt = dpdk_copy_batch_to_mbuf(netdev, batch);
stats->tx_failure_drops += pkt_cnt - cnt;
pkt_cnt = cnt;
}
/* Drop oversized packets. */
cnt = netdev_dpdk_filter_packet_len(dev, pkts, pkt_cnt);
stats->tx_mtu_exceeded_drops += pkt_cnt - cnt;
pkt_cnt = cnt;
/* Prepare each mbuf for hardware offloading. */
cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, pkt_cnt);
stats->tx_invalid_hwol_drops += pkt_cnt - cnt;
pkt_cnt = cnt;
/* Apply Quality of Service policy. */
cnt = netdev_dpdk_qos_run(dev, pkts, pkt_cnt, true);
stats->tx_qos_drops += pkt_cnt - cnt;
return cnt;
}
static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch,
bool concurrent_txq OVS_UNUSED)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int max_retries = VHOST_ENQ_RETRY_MIN;
int cnt, batch_cnt, vhost_batch_cnt;
int vid = netdev_dpdk_get_vid(dev);
struct netdev_dpdk_sw_stats stats;
struct rte_mbuf **pkts;
int dropped;
int retries;
batch_cnt = cnt = dp_packet_batch_size(batch);
qid = dev->tx_q[qid % netdev->n_txq].map;
if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
|| !(dev->flags & NETDEV_UP))) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += cnt;
rte_spinlock_unlock(&dev->stats_lock);
dp_packet_delete_batch(batch, true);
return 0;
}
if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
COVERAGE_INC(vhost_tx_contention);
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
}
cnt = netdev_dpdk_common_send(netdev, batch, &stats);
dropped = batch_cnt - cnt;
pkts = (struct rte_mbuf **) batch->packets;
vhost_batch_cnt = cnt;
retries = 0;
do {
int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
int tx_pkts;
tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, pkts, cnt);
if (OVS_LIKELY(tx_pkts)) {
/* Packets have been sent.*/
cnt -= tx_pkts;
/* Prepare for possible retry.*/
pkts = &pkts[tx_pkts];
if (OVS_UNLIKELY(cnt && !retries)) {
/*
* Read max retries as there are packets not sent
* and no retries have already occurred.
*/
atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
}
} else {
/* No packets sent - do not retry.*/
break;
}
} while (cnt && (retries++ < max_retries));
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
stats.tx_failure_drops += cnt;
dropped += cnt;
stats.tx_retries = MIN(retries, max_retries);
if (OVS_UNLIKELY(dropped || stats.tx_retries)) {
struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dropped;
sw_stats->tx_retries += stats.tx_retries;
sw_stats->tx_failure_drops += stats.tx_failure_drops;
sw_stats->tx_mtu_exceeded_drops += stats.tx_mtu_exceeded_drops;
sw_stats->tx_qos_drops += stats.tx_qos_drops;
sw_stats->tx_invalid_hwol_drops += stats.tx_invalid_hwol_drops;
rte_spinlock_unlock(&dev->stats_lock);
}
pkts = (struct rte_mbuf **) batch->packets;
rte_pktmbuf_free_bulk(pkts, vhost_batch_cnt);
return 0;
}
static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch, bool concurrent_txq)
{
struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int batch_cnt = dp_packet_batch_size(batch);
struct netdev_dpdk_sw_stats stats;
int cnt, dropped;
if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dp_packet_batch_size(batch);
rte_spinlock_unlock(&dev->stats_lock);
dp_packet_delete_batch(batch, true);
return 0;
}
if (OVS_UNLIKELY(concurrent_txq)) {
qid = qid % dev->up.n_txq;
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
}
cnt = netdev_dpdk_common_send(netdev, batch, &stats);
dropped = netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);
stats.tx_failure_drops += dropped;
dropped += batch_cnt - cnt;
if (OVS_UNLIKELY(dropped)) {
struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped += dropped;
sw_stats->tx_failure_drops += stats.tx_failure_drops;
sw_stats->tx_mtu_exceeded_drops += stats.tx_mtu_exceeded_drops;
sw_stats->tx_qos_drops += stats.tx_qos_drops;
sw_stats->tx_invalid_hwol_drops += stats.tx_invalid_hwol_drops;
rte_spinlock_unlock(&dev->stats_lock);
}
if (OVS_UNLIKELY(concurrent_txq)) {
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
}
return 0;
}
static int
netdev_dpdk_set_etheraddr__(struct netdev_dpdk *dev, const struct eth_addr mac)
OVS_REQUIRES(dev->mutex)
{
int err = 0;
if (dev->type == DPDK_DEV_ETH) {
struct rte_ether_addr ea;
memcpy(ea.addr_bytes, mac.ea, ETH_ADDR_LEN);
err = -rte_eth_dev_default_mac_addr_set(dev->port_id, &ea);
}
if (!err) {
dev->hwaddr = mac;
} else {
VLOG_WARN("%s: Failed to set requested mac("ETH_ADDR_FMT"): %s",
netdev_get_name(&dev->up), ETH_ADDR_ARGS(mac),
rte_strerror(err));
}
return err;
}
static int
netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int err = 0;
ovs_mutex_lock(&dev->mutex);
if (!eth_addr_equals(dev->hwaddr, mac)) {
err = netdev_dpdk_set_etheraddr__(dev, mac);
if (!err) {
netdev_change_seq_changed(netdev);
}
}
ovs_mutex_unlock(&dev->mutex);
return err;
}
static int
netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
*mac = dev->hwaddr;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
*mtup = dev->mtu;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
/* XXX: Ensure that the overall frame length of the requested MTU does not
* surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
* the L2 frame length is calculated for a given MTU when
* rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
* headers, the em driver includes 1 x vlan header, the ixgbe driver does
* not include vlan headers. As such we should use
* MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
* (8 bytes) for comparison. This avoids a failure later with
* rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
* a method to retrieve the upper bound MTU for a given device.
*/
if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
|| mtu < RTE_ETHER_MIN_MTU) {
VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
return EINVAL;
}
ovs_mutex_lock(&dev->mutex);
if (dev->requested_mtu != mtu) {
dev->requested_mtu = mtu;
netdev_request_reconfigure(netdev);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
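/* Aggregates per-virtqueue vhost counters into 'stats'.  Rx statistics are
 * taken from the guest Tx virtqueues and Tx statistics from the guest Rx
 * virtqueues. */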
static int
netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
struct netdev_stats *stats)
{
struct rte_vhost_stat_name *vhost_stats_names = NULL;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_vhost_stat *vhost_stats = NULL;
int vhost_stats_count;
int err;
int qid;
int vid;
ovs_mutex_lock(&dev->mutex);
if (!is_vhost_running(dev)) {
err = EPROTO;
goto out;
}
vid = netdev_dpdk_get_vid(dev);
    /* All rxqs are expected to have the same number of stats; query rxq0. */
qid = 0 * VIRTIO_QNUM + VIRTIO_TXQ;
err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
if (err < 0) {
err = EPROTO;
goto out;
}
vhost_stats_count = err;
vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names);
vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats);
err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
#define VHOST_RXQ_STATS \
VHOST_RXQ_STAT(rx_packets, "good_packets") \
VHOST_RXQ_STAT(rx_bytes, "good_bytes") \
VHOST_RXQ_STAT(rx_broadcast_packets, "broadcast_packets") \
VHOST_RXQ_STAT(multicast, "multicast_packets") \
VHOST_RXQ_STAT(rx_undersized_errors, "undersize_packets") \
VHOST_RXQ_STAT(rx_1_to_64_packets, "size_64_packets") \
VHOST_RXQ_STAT(rx_65_to_127_packets, "size_65_127_packets") \
VHOST_RXQ_STAT(rx_128_to_255_packets, "size_128_255_packets") \
VHOST_RXQ_STAT(rx_256_to_511_packets, "size_256_511_packets") \
VHOST_RXQ_STAT(rx_512_to_1023_packets, "size_512_1023_packets") \
VHOST_RXQ_STAT(rx_1024_to_1522_packets, "size_1024_1518_packets") \
VHOST_RXQ_STAT(rx_1523_to_max_packets, "size_1519_max_packets")
#define VHOST_RXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0;
VHOST_RXQ_STATS;
#undef VHOST_RXQ_STAT
for (int q = 0; q < dev->up.n_rxq; q++) {
qid = q * VIRTIO_QNUM + VIRTIO_TXQ;
err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
for (int i = 0; i < vhost_stats_count; i++) {
#define VHOST_RXQ_STAT(MEMBER, NAME) \
if (string_ends_with(vhost_stats_names[i].name, NAME)) { \
dev->stats.MEMBER += vhost_stats[i].value; \
continue; \
}
VHOST_RXQ_STATS;
#undef VHOST_RXQ_STAT
}
}
/* OVS reports 64 bytes and smaller packets into "rx_1_to_64_packets".
* Since vhost only reports good packets and has no error counter,
* rx_undersized_errors is highjacked (see above) to retrieve
* "undersize_packets". */
dev->stats.rx_1_to_64_packets += dev->stats.rx_undersized_errors;
memset(&dev->stats.rx_undersized_errors, 0xff,
sizeof dev->stats.rx_undersized_errors);
#define VHOST_RXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER;
VHOST_RXQ_STATS;
#undef VHOST_RXQ_STAT
free(vhost_stats_names);
vhost_stats_names = NULL;
free(vhost_stats);
vhost_stats = NULL;
    /* All txqs are expected to have the same number of stats; query txq0. */
qid = 0 * VIRTIO_QNUM;
err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
if (err < 0) {
err = EPROTO;
goto out;
}
vhost_stats_count = err;
vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names);
vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats);
err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
#define VHOST_TXQ_STATS \
VHOST_TXQ_STAT(tx_packets, "good_packets") \
VHOST_TXQ_STAT(tx_bytes, "good_bytes") \
VHOST_TXQ_STAT(tx_broadcast_packets, "broadcast_packets") \
VHOST_TXQ_STAT(tx_multicast_packets, "multicast_packets") \
VHOST_TXQ_STAT(rx_undersized_errors, "undersize_packets") \
VHOST_TXQ_STAT(tx_1_to_64_packets, "size_64_packets") \
VHOST_TXQ_STAT(tx_65_to_127_packets, "size_65_127_packets") \
VHOST_TXQ_STAT(tx_128_to_255_packets, "size_128_255_packets") \
VHOST_TXQ_STAT(tx_256_to_511_packets, "size_256_511_packets") \
VHOST_TXQ_STAT(tx_512_to_1023_packets, "size_512_1023_packets") \
VHOST_TXQ_STAT(tx_1024_to_1522_packets, "size_1024_1518_packets") \
VHOST_TXQ_STAT(tx_1523_to_max_packets, "size_1519_max_packets")
#define VHOST_TXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0;
VHOST_TXQ_STATS;
#undef VHOST_TXQ_STAT
for (int q = 0; q < dev->up.n_txq; q++) {
qid = q * VIRTIO_QNUM;
err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
vhost_stats_count);
if (err != vhost_stats_count) {
err = EPROTO;
goto out;
}
for (int i = 0; i < vhost_stats_count; i++) {
#define VHOST_TXQ_STAT(MEMBER, NAME) \
if (string_ends_with(vhost_stats_names[i].name, NAME)) { \
dev->stats.MEMBER += vhost_stats[i].value; \
continue; \
}
VHOST_TXQ_STATS;
#undef VHOST_TXQ_STAT
}
}
/* OVS reports 64 bytes and smaller packets into "tx_1_to_64_packets".
* Same as for rx, rx_undersized_errors is highjacked. */
dev->stats.tx_1_to_64_packets += dev->stats.rx_undersized_errors;
memset(&dev->stats.rx_undersized_errors, 0xff,
sizeof dev->stats.rx_undersized_errors);
#define VHOST_TXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER;
VHOST_TXQ_STATS;
#undef VHOST_TXQ_STAT
rte_spinlock_lock(&dev->stats_lock);
stats->rx_dropped = dev->stats.rx_dropped;
stats->tx_dropped = dev->stats.tx_dropped;
rte_spinlock_unlock(&dev->stats_lock);
err = 0;
out:
ovs_mutex_unlock(&dev->mutex);
free(vhost_stats);
free(vhost_stats_names);
return err;
}
static int
netdev_dpdk_vhost_get_custom_stats(const struct netdev *netdev,
struct netdev_custom_stats *custom_stats)
{
struct rte_vhost_stat_name *vhost_stats_names = NULL;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_vhost_stat *vhost_stats = NULL;
int vhost_rxq_stats_count;
int vhost_txq_stats_count;
int stat_offset;
int err;
int qid;
int vid;
netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);
stat_offset = custom_stats->size;
ovs_mutex_lock(&dev->mutex);
if (!is_vhost_running(dev)) {
goto out;
}
vid = netdev_dpdk_get_vid(dev);
qid = 0 * VIRTIO_QNUM + VIRTIO_TXQ;
err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
if (err < 0) {
goto out;
}
vhost_rxq_stats_count = err;
qid = 0 * VIRTIO_QNUM;
err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0);
if (err < 0) {
goto out;
}
vhost_txq_stats_count = err;
stat_offset += dev->up.n_rxq * vhost_rxq_stats_count;
stat_offset += dev->up.n_txq * vhost_txq_stats_count;
custom_stats->counters = xrealloc(custom_stats->counters,
stat_offset *
sizeof *custom_stats->counters);
stat_offset = custom_stats->size;
vhost_stats_names = xcalloc(vhost_rxq_stats_count,
sizeof *vhost_stats_names);
vhost_stats = xcalloc(vhost_rxq_stats_count, sizeof *vhost_stats);
for (int q = 0; q < dev->up.n_rxq; q++) {
qid = q * VIRTIO_QNUM + VIRTIO_TXQ;
err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
vhost_rxq_stats_count);
if (err != vhost_rxq_stats_count) {
goto out;
}
err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
vhost_rxq_stats_count);
if (err != vhost_rxq_stats_count) {
goto out;
}
for (int i = 0; i < vhost_rxq_stats_count; i++) {
ovs_strlcpy(custom_stats->counters[stat_offset + i].name,
vhost_stats_names[i].name,
NETDEV_CUSTOM_STATS_NAME_SIZE);
custom_stats->counters[stat_offset + i].value =
vhost_stats[i].value;
}
stat_offset += vhost_rxq_stats_count;
}
free(vhost_stats_names);
vhost_stats_names = NULL;
free(vhost_stats);
vhost_stats = NULL;
vhost_stats_names = xcalloc(vhost_txq_stats_count,
sizeof *vhost_stats_names);
vhost_stats = xcalloc(vhost_txq_stats_count, sizeof *vhost_stats);
for (int q = 0; q < dev->up.n_txq; q++) {
qid = q * VIRTIO_QNUM;
err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names,
vhost_txq_stats_count);
if (err != vhost_txq_stats_count) {
goto out;
}
err = rte_vhost_vring_stats_get(vid, qid, vhost_stats,
vhost_txq_stats_count);
if (err != vhost_txq_stats_count) {
goto out;
}
for (int i = 0; i < vhost_txq_stats_count; i++) {
ovs_strlcpy(custom_stats->counters[stat_offset + i].name,
vhost_stats_names[i].name,
NETDEV_CUSTOM_STATS_NAME_SIZE);
custom_stats->counters[stat_offset + i].value =
vhost_stats[i].value;
}
stat_offset += vhost_txq_stats_count;
}
out:
ovs_mutex_unlock(&dev->mutex);
custom_stats->size = stat_offset;
free(vhost_stats_names);
free(vhost_stats);
return 0;
}
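/* Maps the DPDK extended statistics listed in DPDK_XSTATS below onto the
 * corresponding 'struct netdev_stats' members. */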
static void
netdev_dpdk_convert_xstats(struct netdev_stats *stats,
const struct rte_eth_xstat *xstats,
const struct rte_eth_xstat_name *names,
const unsigned int size)
{
/* DPDK XSTATS Counter names definition. */
#define DPDK_XSTATS \
DPDK_XSTAT(multicast, "rx_multicast_packets" ) \
DPDK_XSTAT(tx_multicast_packets, "tx_multicast_packets" ) \
DPDK_XSTAT(rx_broadcast_packets, "rx_broadcast_packets" ) \
DPDK_XSTAT(tx_broadcast_packets, "tx_broadcast_packets" ) \
DPDK_XSTAT(rx_undersized_errors, "rx_undersized_errors" ) \
DPDK_XSTAT(rx_oversize_errors, "rx_oversize_errors" ) \
DPDK_XSTAT(rx_fragmented_errors, "rx_fragmented_errors" ) \
DPDK_XSTAT(rx_jabber_errors, "rx_jabber_errors" ) \
DPDK_XSTAT(rx_1_to_64_packets, "rx_size_64_packets" ) \
DPDK_XSTAT(rx_65_to_127_packets, "rx_size_65_to_127_packets" ) \
DPDK_XSTAT(rx_128_to_255_packets, "rx_size_128_to_255_packets" ) \
DPDK_XSTAT(rx_256_to_511_packets, "rx_size_256_to_511_packets" ) \
DPDK_XSTAT(rx_512_to_1023_packets, "rx_size_512_to_1023_packets" ) \
DPDK_XSTAT(rx_1024_to_1522_packets, "rx_size_1024_to_1522_packets" ) \
DPDK_XSTAT(rx_1523_to_max_packets, "rx_size_1523_to_max_packets" ) \
DPDK_XSTAT(tx_1_to_64_packets, "tx_size_64_packets" ) \
DPDK_XSTAT(tx_65_to_127_packets, "tx_size_65_to_127_packets" ) \
DPDK_XSTAT(tx_128_to_255_packets, "tx_size_128_to_255_packets" ) \
DPDK_XSTAT(tx_256_to_511_packets, "tx_size_256_to_511_packets" ) \
DPDK_XSTAT(tx_512_to_1023_packets, "tx_size_512_to_1023_packets" ) \
DPDK_XSTAT(tx_1024_to_1522_packets, "tx_size_1024_to_1522_packets" ) \
DPDK_XSTAT(tx_1523_to_max_packets, "tx_size_1523_to_max_packets" )
for (unsigned int i = 0; i < size; i++) {
#define DPDK_XSTAT(MEMBER, NAME) \
if (strcmp(NAME, names[i].name) == 0) { \
stats->MEMBER = xstats[i].value; \
continue; \
}
DPDK_XSTATS;
#undef DPDK_XSTAT
}
#undef DPDK_XSTATS
}
static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
static int
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_stats rte_stats;
bool gg;
netdev_dpdk_get_carrier(netdev, &gg);
ovs_mutex_lock(&dev->mutex);
struct rte_eth_xstat *rte_xstats = NULL;
struct rte_eth_xstat_name *rte_xstats_names = NULL;
int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
dev->port_id);
ovs_mutex_unlock(&dev->mutex);
return EPROTO;
}
/* Get length of statistics */
rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
if (rte_xstats_len < 0) {
VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
dev->port_id);
goto out;
}
/* Reserve memory for xstats names and values */
rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
    /* Retrieve xstats names. */
rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
rte_xstats_names,
rte_xstats_len);
if (rte_xstats_new_len != rte_xstats_len) {
VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
dev->port_id);
goto out;
}
    /* Retrieve xstats values. */
memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
rte_xstats_len);
if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
rte_xstats_len);
} else {
VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
dev->port_id);
}
out:
free(rte_xstats);
free(rte_xstats_names);
stats->rx_packets = rte_stats.ipackets;
stats->tx_packets = rte_stats.opackets;
stats->rx_bytes = rte_stats.ibytes;
stats->tx_bytes = rte_stats.obytes;
stats->rx_errors = rte_stats.ierrors;
stats->tx_errors = rte_stats.oerrors;
rte_spinlock_lock(&dev->stats_lock);
stats->tx_dropped = dev->stats.tx_dropped;
stats->rx_dropped = dev->stats.rx_dropped;
rte_spinlock_unlock(&dev->stats_lock);
/* These are the available DPDK counters for packets not received due to
* local resource constraints in DPDK and NIC respectively. */
stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
stats->rx_missed_errors = rte_stats.imissed;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_get_custom_stats(const struct netdev *netdev,
struct netdev_custom_stats *custom_stats)
{
uint32_t i;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int rte_xstats_ret, sw_stats_size;
netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);
ovs_mutex_lock(&dev->mutex);
if (dev->rte_xstats_ids_size > 0) {
uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
sizeof(uint64_t));
rte_xstats_ret =
rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
values, dev->rte_xstats_ids_size);
if (rte_xstats_ret > 0 &&
rte_xstats_ret <= dev->rte_xstats_ids_size) {
sw_stats_size = custom_stats->size;
custom_stats->size += rte_xstats_ret;
custom_stats->counters = xrealloc(custom_stats->counters,
custom_stats->size *
sizeof *custom_stats->counters);
for (i = 0; i < rte_xstats_ret; i++) {
ovs_strlcpy(custom_stats->counters[sw_stats_size + i].name,
netdev_dpdk_get_xstat_name(dev,
dev->rte_xstats_ids[i]),
NETDEV_CUSTOM_STATS_NAME_SIZE);
custom_stats->counters[sw_stats_size + i].value = values[i];
}
} else {
VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
dev->port_id);
}
free(values);
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev,
struct netdev_custom_stats *custom_stats)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int i, n;
#define SW_CSTATS \
SW_CSTAT(tx_retries) \
SW_CSTAT(tx_failure_drops) \
SW_CSTAT(tx_mtu_exceeded_drops) \
SW_CSTAT(tx_qos_drops) \
SW_CSTAT(rx_qos_drops) \
SW_CSTAT(tx_invalid_hwol_drops)
#define SW_CSTAT(NAME) + 1
custom_stats->size = SW_CSTATS;
#undef SW_CSTAT
custom_stats->counters = xcalloc(custom_stats->size,
sizeof *custom_stats->counters);
ovs_mutex_lock(&dev->mutex);
rte_spinlock_lock(&dev->stats_lock);
i = 0;
#define SW_CSTAT(NAME) \
custom_stats->counters[i++].value = dev->sw_stats->NAME;
SW_CSTATS;
#undef SW_CSTAT
rte_spinlock_unlock(&dev->stats_lock);
ovs_mutex_unlock(&dev->mutex);
i = 0;
n = 0;
#define SW_CSTAT(NAME) \
if (custom_stats->counters[i].value != UINT64_MAX) { \
ovs_strlcpy(custom_stats->counters[n].name, \
"ovs_"#NAME, NETDEV_CUSTOM_STATS_NAME_SIZE); \
custom_stats->counters[n].value = custom_stats->counters[i].value; \
n++; \
} \
i++;
SW_CSTATS;
#undef SW_CSTAT
custom_stats->size = n;
return 0;
}
static int
netdev_dpdk_get_features(const struct netdev *netdev,
enum netdev_features *current,
enum netdev_features *advertised,
enum netdev_features *supported,
enum netdev_features *peer)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_link link;
uint32_t feature = 0;
ovs_mutex_lock(&dev->mutex);
link = dev->link;
ovs_mutex_unlock(&dev->mutex);
/* Match against OpenFlow defined link speed values. */
if (link.link_duplex == RTE_ETH_LINK_FULL_DUPLEX) {
switch (link.link_speed) {
case RTE_ETH_SPEED_NUM_10M:
feature |= NETDEV_F_10MB_FD;
break;
case RTE_ETH_SPEED_NUM_100M:
feature |= NETDEV_F_100MB_FD;
break;
case RTE_ETH_SPEED_NUM_1G:
feature |= NETDEV_F_1GB_FD;
break;
case RTE_ETH_SPEED_NUM_10G:
feature |= NETDEV_F_10GB_FD;
break;
case RTE_ETH_SPEED_NUM_40G:
feature |= NETDEV_F_40GB_FD;
break;
case RTE_ETH_SPEED_NUM_100G:
feature |= NETDEV_F_100GB_FD;
break;
default:
feature |= NETDEV_F_OTHER;
}
} else if (link.link_duplex == RTE_ETH_LINK_HALF_DUPLEX) {
switch (link.link_speed) {
case RTE_ETH_SPEED_NUM_10M:
feature |= NETDEV_F_10MB_HD;
break;
case RTE_ETH_SPEED_NUM_100M:
feature |= NETDEV_F_100MB_HD;
break;
case RTE_ETH_SPEED_NUM_1G:
feature |= NETDEV_F_1GB_HD;
break;
default:
feature |= NETDEV_F_OTHER;
}
}
if (link.link_autoneg) {
feature |= NETDEV_F_AUTONEG;
}
*current = feature;
*advertised = *supported = *peer = 0;
return 0;
}
static int
netdev_dpdk_get_speed(const struct netdev *netdev, uint32_t *current,
uint32_t *max)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_dev_info dev_info;
struct rte_eth_link link;
int diag;
ovs_mutex_lock(&dev->mutex);
link = dev->link;
diag = rte_eth_dev_info_get(dev->port_id, &dev_info);
ovs_mutex_unlock(&dev->mutex);
*current = link.link_speed != RTE_ETH_SPEED_NUM_UNKNOWN
? link.link_speed : 0;
if (diag < 0) {
*max = 0;
goto out;
}
if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_200G) {
*max = RTE_ETH_SPEED_NUM_200G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100G) {
*max = RTE_ETH_SPEED_NUM_100G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_56G) {
*max = RTE_ETH_SPEED_NUM_56G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_50G) {
*max = RTE_ETH_SPEED_NUM_50G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_40G) {
*max = RTE_ETH_SPEED_NUM_40G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_25G) {
*max = RTE_ETH_SPEED_NUM_25G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_20G) {
*max = RTE_ETH_SPEED_NUM_20G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10G) {
*max = RTE_ETH_SPEED_NUM_10G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_5G) {
*max = RTE_ETH_SPEED_NUM_5G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_2_5G) {
*max = RTE_ETH_SPEED_NUM_2_5G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_1G) {
*max = RTE_ETH_SPEED_NUM_1G;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M ||
dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M_HD) {
*max = RTE_ETH_SPEED_NUM_100M;
} else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M ||
dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M_HD) {
*max = RTE_ETH_SPEED_NUM_10M;
} else {
*max = 0;
}
out:
return 0;
}
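/* Builds an srTCM-based ingress policer.  'rate' (kbits per second) and
 * 'burst' (kbits) are converted to bytes as required by rte_meter.
 * Returns NULL if the meter profile cannot be configured. */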
static struct ingress_policer *
netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
{
struct ingress_policer *policer = NULL;
uint64_t rate_bytes;
uint64_t burst_bytes;
int err = 0;
policer = xmalloc(sizeof *policer);
rte_spinlock_init(&policer->policer_lock);
/* rte_meter requires bytes so convert kbits rate and burst to bytes. */
rate_bytes = rate * 1000ULL / 8;
burst_bytes = burst * 1000ULL / 8;
policer->app_srtcm_params.cir = rate_bytes;
policer->app_srtcm_params.cbs = burst_bytes;
policer->app_srtcm_params.ebs = 0;
err = rte_meter_srtcm_profile_config(&policer->in_prof,
&policer->app_srtcm_params);
if (!err) {
err = rte_meter_srtcm_config(&policer->in_policer,
&policer->in_prof);
}
if (err) {
VLOG_ERR("Could not create rte meter for ingress policer");
free(policer);
return NULL;
}
return policer;
}
static int
netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
uint32_t policer_burst,
uint32_t policer_kpkts_rate OVS_UNUSED,
uint32_t policer_kpkts_burst OVS_UNUSED)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct ingress_policer *policer;
/* Force to 0 if no rate specified,
* default to 8000 kbits if burst is 0,
* else stick with user-specified value.
*/
policer_burst = (!policer_rate ? 0
: !policer_burst ? 8000
: policer_burst);
ovs_mutex_lock(&dev->mutex);
policer = ovsrcu_get_protected(struct ingress_policer *,
&dev->ingress_policer);
if (dev->policer_rate == policer_rate &&
dev->policer_burst == policer_burst) {
/* Assume that settings haven't changed since we last set them. */
ovs_mutex_unlock(&dev->mutex);
return 0;
}
    /* Destroy any existing ingress policer for the device, if one exists. */
if (policer) {
ovsrcu_postpone(free, policer);
}
if (policer_rate != 0) {
policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
} else {
policer = NULL;
}
ovsrcu_set(&dev->ingress_policer, policer);
dev->policer_rate = policer_rate;
dev->policer_burst = policer_burst;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
    /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
     * positive integer to meet RFC 2863 recommendations.
     */
int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
ovs_mutex_unlock(&dev->mutex);
return ifindex;
}
static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
check_link_status(dev);
*carrier = dev->link.link_status;
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static int
netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
if (is_vhost_running(dev)) {
*carrier = 1;
} else {
*carrier = 0;
}
ovs_mutex_unlock(&dev->mutex);
return 0;
}
static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
long long int carrier_resets;
ovs_mutex_lock(&dev->mutex);
carrier_resets = dev->link_reset_cnt;
ovs_mutex_unlock(&dev->mutex);
return carrier_resets;
}
static int
netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
long long int interval OVS_UNUSED)
{
return EOPNOTSUPP;
}
static int
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
enum netdev_flags off, enum netdev_flags on,
enum netdev_flags *old_flagsp)
OVS_REQUIRES(dev->mutex)
{
if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
return EINVAL;
}
*old_flagsp = dev->flags;
dev->flags |= on;
dev->flags &= ~off;
if (dev->flags == *old_flagsp) {
return 0;
}
if (dev->type == DPDK_DEV_ETH) {
if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
int err;
if (dev->flags & NETDEV_UP) {
err = rte_eth_dev_set_link_up(dev->port_id);
} else {
err = rte_eth_dev_set_link_down(dev->port_id);
}
if (err == -ENOTSUP) {
VLOG_INFO("Interface %s does not support link state "
"configuration", netdev_get_name(&dev->up));
} else if (err < 0) {
VLOG_ERR("Interface %s link change error: %s",
netdev_get_name(&dev->up), rte_strerror(-err));
dev->flags = *old_flagsp;
return -err;
}
}
if (dev->flags & NETDEV_PROMISC) {
rte_eth_promiscuous_enable(dev->port_id);
}
netdev_change_seq_changed(&dev->up);
} else {
/* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
* running then change netdev's change_seq to trigger link state
* update. */
if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
&& is_vhost_running(dev)) {
netdev_change_seq_changed(&dev->up);
/* Clear statistics if device is getting up. */
if (NETDEV_UP & on) {
rte_spinlock_lock(&dev->stats_lock);
memset(&dev->stats, 0, sizeof dev->stats);
memset(dev->sw_stats, 0, sizeof *dev->sw_stats);
rte_spinlock_unlock(&dev->stats_lock);
}
}
}
return 0;
}
static int
netdev_dpdk_update_flags(struct netdev *netdev,
enum netdev_flags off, enum netdev_flags on,
enum netdev_flags *old_flagsp)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int error;
ovs_mutex_lock(&dev->mutex);
error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
int vid = netdev_dpdk_get_vid(dev);
if (vid < 0) {
smap_add_format(args, "status", "disconnected");
ovs_mutex_unlock(&dev->mutex);
return 0;
} else {
smap_add_format(args, "status", "connected");
}
char socket_name[PATH_MAX];
if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
smap_add_format(args, "socket", "%s", socket_name);
}
uint64_t features;
if (!rte_vhost_get_negotiated_features(vid, &features)) {
smap_add_format(args, "features", "0x%016"PRIx64, features);
}
uint16_t mtu;
if (!rte_vhost_get_mtu(vid, &mtu)) {
smap_add_format(args, "mtu", "%d", mtu);
}
int numa = rte_vhost_get_numa_node(vid);
if (numa >= 0) {
smap_add_format(args, "numa", "%d", numa);
}
uint16_t vring_num = rte_vhost_get_vring_num(vid);
if (vring_num) {
smap_add_format(args, "num_of_vrings", "%d", vring_num);
}
for (int i = 0; i < vring_num; i++) {
struct rte_vhost_vring vring;
rte_vhost_get_vhost_vring(vid, i, &vring);
smap_add_nocopy(args, xasprintf("vring_%d_size", i),
xasprintf("%d", vring.size));
}
if (userspace_tso_enabled()
&& dev->virtio_features_state & OVS_VIRTIO_F_WORKAROUND) {
smap_add_format(args, "userspace-tso", "disabled");
}
smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
smap_add_format(args, "n_txq", "%d", netdev->n_txq);
ovs_mutex_unlock(&dev->mutex);
return 0;
}
/*
* Convert a given uint32_t link speed defined in DPDK to a string
* equivalent.
*/
static const char *
netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
{
switch (link_speed) {
case RTE_ETH_SPEED_NUM_10M: return "10Mbps";
case RTE_ETH_SPEED_NUM_100M: return "100Mbps";
case RTE_ETH_SPEED_NUM_1G: return "1Gbps";
case RTE_ETH_SPEED_NUM_2_5G: return "2.5Gbps";
case RTE_ETH_SPEED_NUM_5G: return "5Gbps";
case RTE_ETH_SPEED_NUM_10G: return "10Gbps";
case RTE_ETH_SPEED_NUM_20G: return "20Gbps";
case RTE_ETH_SPEED_NUM_25G: return "25Gbps";
case RTE_ETH_SPEED_NUM_40G: return "40Gbps";
case RTE_ETH_SPEED_NUM_50G: return "50Gbps";
case RTE_ETH_SPEED_NUM_56G: return "56Gbps";
case RTE_ETH_SPEED_NUM_100G: return "100Gbps";
default: return "Not Defined";
}
}
static int
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_dev_info dev_info;
size_t rx_steer_flows_num;
uint64_t rx_steer_flags;
uint32_t link_speed;
int n_rxq;
int diag;
if (!rte_eth_dev_is_valid_port(dev->port_id)) {
return ENODEV;
}
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
diag = rte_eth_dev_info_get(dev->port_id, &dev_info);
link_speed = dev->link.link_speed;
rx_steer_flags = dev->rx_steer_flags;
rx_steer_flows_num = dev->rx_steer_flows_num;
n_rxq = netdev->n_rxq;
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
smap_add_format(args, "numa_id", "%d",
rte_eth_dev_socket_id(dev->port_id));
if (!diag) {
smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
}
smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
if (!diag) {
smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
smap_add_format(args, "max_hash_mac_addrs", "%u",
dev_info.max_hash_mac_addrs);
smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
}
smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
smap_add_format(args, "n_txq", "%d", netdev->n_txq);
smap_add(args, "rx_csum_offload",
dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD
? "true" : "false");
/* Querying the DPDK library for iftype may be done in future, pending
* support; cf. RFC 3635 Section 3.2.4. */
enum { IF_TYPE_ETHERNETCSMACD = 6 };
smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
smap_add_format(args, "if_descr", "%s %s", rte_version(),
diag < 0 ? "<unknown>" : dev_info.driver_name);
if (!diag) {
const char *bus_info = rte_dev_bus_info(dev_info.device);
smap_add_format(args, "bus_info", "bus_name=%s%s%s",
rte_bus_name(rte_dev_bus(dev_info.device)),
bus_info != NULL ? ", " : "",
bus_info != NULL ? bus_info : "");
}
    /* Not all link speeds are defined in the OpenFlow specs, e.g. 25 Gbps.
* In that case the speed will not be reported as part of the usual
* call to get_features(). Get the link speed of the device and add it
* to the device status in an easy to read string format.
*/
smap_add(args, "link_speed",
netdev_dpdk_link_speed_to_str__(link_speed));
if (dev->is_representor) {
smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT,
ETH_ADDR_ARGS(dev->hwaddr));
}
if (rx_steer_flags && !rx_steer_flows_num) {
smap_add(args, "rx-steering", "unsupported");
} else if (rx_steer_flags == DPDK_RX_STEER_LACP) {
smap_add(args, "rx-steering", "rss+lacp");
} else {
ovs_assert(!rx_steer_flags);
smap_add(args, "rx-steering", "rss");
}
if (rx_steer_flags && rx_steer_flows_num) {
smap_add_format(args, "rx_steering_queue", "%d", n_rxq - 1);
if (n_rxq > 2) {
smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2);
} else {
smap_add(args, "rss_queues", "0");
}
}
return 0;
}
static void
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
OVS_REQUIRES(dev->mutex)
{
enum netdev_flags old_flags;
if (admin_state) {
netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
} else {
netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
}
}
static void
netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
const char *argv[], void *aux OVS_UNUSED)
{
bool up;
if (!strcasecmp(argv[argc - 1], "up")) {
up = true;
    } else if (!strcasecmp(argv[argc - 1], "down")) {
up = false;
} else {
unixctl_command_reply_error(conn, "Invalid Admin State");
return;
}
if (argc > 2) {
struct netdev *netdev = netdev_from_name(argv[1]);
if (netdev && is_dpdk_class(netdev->netdev_class)) {
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
netdev_dpdk_set_admin_state__(dev, up);
ovs_mutex_unlock(&dev->mutex);
netdev_close(netdev);
} else {
unixctl_command_reply_error(conn, "Not a DPDK Interface");
netdev_close(netdev);
return;
}
} else {
struct netdev_dpdk *dev;
ovs_mutex_lock(&dpdk_mutex);
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
ovs_mutex_lock(&dev->mutex);
netdev_dpdk_set_admin_state__(dev, up);
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
}
unixctl_command_reply(conn, "OK");
}
static void
netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
const char *argv[], void *aux OVS_UNUSED)
{
struct ds used_interfaces = DS_EMPTY_INITIALIZER;
struct rte_eth_dev_info dev_info;
dpdk_port_t sibling_port_id;
dpdk_port_t port_id;
bool used = false;
char *response;
int diag;
ovs_mutex_lock(&dpdk_mutex);
port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
if (!rte_eth_dev_is_valid_port(port_id)) {
response = xasprintf("Device '%s' not found in DPDK", argv[1]);
goto error;
}
ds_put_format(&used_interfaces,
"Device '%s' is being used by the following interfaces:",
argv[1]);
RTE_ETH_FOREACH_DEV_SIBLING (sibling_port_id, port_id) {
struct netdev_dpdk *dev;
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
if (dev->port_id != sibling_port_id) {
continue;
}
used = true;
ds_put_format(&used_interfaces, " %s",
netdev_get_name(&dev->up));
break;
}
}
if (used) {
ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
response = ds_steal_cstr(&used_interfaces);
ds_destroy(&used_interfaces);
goto error;
}
ds_destroy(&used_interfaces);
diag = rte_eth_dev_info_get(port_id, &dev_info);
rte_eth_dev_close(port_id);
if (diag < 0 || rte_dev_remove(dev_info.device) < 0) {
response = xasprintf("Device '%s' can not be detached", argv[1]);
goto error;
}
response = xasprintf("All devices shared with device '%s' "
"have been detached", argv[1]);
ovs_mutex_unlock(&dpdk_mutex);
unixctl_command_reply(conn, response);
free(response);
return;
error:
ovs_mutex_unlock(&dpdk_mutex);
unixctl_command_reply_error(conn, response);
free(response);
}
static void
netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
int argc, const char *argv[],
void *aux OVS_UNUSED)
{
struct netdev *netdev = NULL;
const char *error = NULL;
char *response = NULL;
FILE *stream;
size_t size;
if (argc == 2) {
netdev = netdev_from_name(argv[1]);
if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
unixctl_command_reply_error(conn, "Not a DPDK Interface");
goto out;
}
}
stream = open_memstream(&response, &size);
if (!stream) {
response = xasprintf("Unable to open memstream: %s.",
ovs_strerror(errno));
unixctl_command_reply_error(conn, response);
goto out;
}
if (netdev) {
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ovs_mutex_lock(&dpdk_mp_mutex);
if (dev->dpdk_mp) {
rte_mempool_dump(stream, dev->dpdk_mp->mp);
fprintf(stream, " count: avail (%u), in use (%u)\n",
rte_mempool_avail_count(dev->dpdk_mp->mp),
rte_mempool_in_use_count(dev->dpdk_mp->mp));
} else {
error = "Not allocated";
}
ovs_mutex_unlock(&dpdk_mp_mutex);
ovs_mutex_unlock(&dev->mutex);
} else {
ovs_mutex_lock(&dpdk_mp_mutex);
rte_mempool_list_dump(stream);
ovs_mutex_unlock(&dpdk_mp_mutex);
}
fclose(stream);
if (error) {
unixctl_command_reply_error(conn, error);
} else {
unixctl_command_reply(conn, response);
}
out:
free(response);
netdev_close(netdev);
}
/*
* Set virtqueue flags so that we do not receive interrupts.
*/
static void
set_irq_status(int vid)
{
uint32_t i;
for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
rte_vhost_enable_guest_notification(vid, i, 0);
}
}
/*
* Fixes mapping for vhost-user tx queues. Must be called after each
* enabling/disabling of queues and n_txq modifications.
*/
static void
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
int *enabled_queues, n_enabled = 0;
int i, k, total_txqs = dev->up.n_txq;
enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);
for (i = 0; i < total_txqs; i++) {
        /* Enabled queues are always mapped to themselves. */
if (dev->tx_q[i].map == i) {
enabled_queues[n_enabled++] = i;
}
}
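    /* If no queue is enabled, map all queues to a single disabled slot so
     * that the send path drops packets instead of using a stale mapping. */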
if (n_enabled == 0 && total_txqs != 0) {
enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
n_enabled = 1;
}
k = 0;
for (i = 0; i < total_txqs; i++) {
if (dev->tx_q[i].map != i) {
dev->tx_q[i].map = enabled_queues[k];
k = (k + 1) % n_enabled;
}
}
if (VLOG_IS_DBG_ENABLED()) {
struct ds mapping = DS_EMPTY_INITIALIZER;
ds_put_format(&mapping, "TX queue mapping for port '%s':\n",
netdev_get_name(&dev->up));
for (i = 0; i < total_txqs; i++) {
ds_put_format(&mapping, "%2d --> %2d\n", i, dev->tx_q[i].map);
}
VLOG_DBG("%s", ds_cstr(&mapping));
ds_destroy(&mapping);
}
free(enabled_queues);
}
/*
* A new virtio-net device is added to a vhost port.
*/
static int
new_device(int vid)
{
struct netdev_dpdk *dev;
bool exists = false;
int newnode = 0;
char ifname[IF_NAME_SZ];
rte_vhost_get_ifname(vid, ifname, sizeof ifname);
ovs_mutex_lock(&dpdk_mutex);
/* Add device to the vhost port with the same name as that passed down. */
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
ovs_mutex_lock(&dev->mutex);
if (nullable_string_is_equal(ifname, dev->vhost_id)) {
uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
uint64_t features;
/* Get NUMA information */
newnode = rte_vhost_get_numa_node(vid);
if (newnode == -1) {
#ifdef VHOST_NUMA
VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
ifname);
#endif
newnode = dev->socket_id;
}
dev->virtio_features_state |= OVS_VIRTIO_F_NEGOTIATED;
if (dev->requested_n_txq < qp_num
|| dev->requested_n_rxq < qp_num
|| dev->requested_socket_id != newnode
|| dev->dpdk_mp == NULL) {
dev->requested_socket_id = newnode;
dev->requested_n_rxq = qp_num;
dev->requested_n_txq = qp_num;
netdev_request_reconfigure(&dev->up);
} else {
/* Reconfiguration not required. */
dev->vhost_reconfigured = true;
}
if (rte_vhost_get_negotiated_features(vid, &features)) {
VLOG_INFO("Error checking guest features for "
"vHost Device '%s'", dev->vhost_id);
} else {
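                /* VIRTIO_NET_F_GUEST_CSUM means the guest accepts packets
                 * with partial checksums, so Tx L4 checksum offload can be
                 * enabled towards this device. */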
if (features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) {
dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
}
if (userspace_tso_enabled()
&& dev->virtio_features_state & OVS_VIRTIO_F_CLEAN) {
if (features & (1ULL << VIRTIO_NET_F_GUEST_TSO4)
&& features & (1ULL << VIRTIO_NET_F_GUEST_TSO6)) {
dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
VLOG_DBG("%s: TSO enabled on vhost port",
netdev_get_name(&dev->up));
} else {
VLOG_WARN("%s: Tx TSO offload is not supported.",
netdev_get_name(&dev->up));
}
}
}
/* There is no support in virtio net to offload IPv4 csum,
* but the vhost library handles IPv4 csum offloading fine. */
dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
netdev_dpdk_update_netdev_flags(dev);
ovsrcu_index_set(&dev->vid, vid);
exists = true;
/* Disable notifications. */
set_irq_status(vid);
netdev_change_seq_changed(&dev->up);
ovs_mutex_unlock(&dev->mutex);
break;
}
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
if (!exists) {
VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
return -1;
}
VLOG_INFO("vHost Device '%s' has been added on numa node %i",
ifname, newnode);
return 0;
}
/* Clears mapping for all available queues of vhost interface. */
static void
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
int i;
for (i = 0; i < dev->up.n_txq; i++) {
dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
}
}
/*
* Remove a virtio-net device from the specific vhost port. Use dev->remove
* flag to stop any more packets from being sent or received to/from a VM and
* ensure all currently queued packets have been sent/received before removing
* the device.
*/
static void
destroy_device(int vid)
{
struct netdev_dpdk *dev;
bool exists = false;
char ifname[IF_NAME_SZ];
rte_vhost_get_ifname(vid, ifname, sizeof ifname);
ovs_mutex_lock(&dpdk_mutex);
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
if (netdev_dpdk_get_vid(dev) == vid) {
ovs_mutex_lock(&dev->mutex);
dev->vhost_reconfigured = false;
ovsrcu_index_set(&dev->vid, -1);
memset(dev->vhost_rxq_enabled, 0,
dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
netdev_dpdk_txq_map_clear(dev);
/* Clear offload capabilities before next new_device. */
dev->hw_ol_features = 0;
netdev_dpdk_update_netdev_flags(dev);
netdev_change_seq_changed(&dev->up);
ovs_mutex_unlock(&dev->mutex);
exists = true;
break;
}
}
ovs_mutex_unlock(&dpdk_mutex);
if (exists) {
/*
         * Wait for other threads to quiesce after clearing the device's
         * 'vid', before returning.
*/
ovsrcu_synchronize();
/*
* As call to ovsrcu_synchronize() will end the quiescent state,
* put thread back into quiescent state before returning.
*/
ovsrcu_quiesce_start();
VLOG_INFO("vHost Device '%s' has been removed", ifname);
} else {
VLOG_INFO("vHost Device '%s' not found", ifname);
}
}
static struct mpsc_queue vhost_state_change_queue
= MPSC_QUEUE_INITIALIZER(&vhost_state_change_queue);
static atomic_uint64_t vhost_state_change_queue_size;
struct vhost_state_change {
struct mpsc_queue_node node;
char ifname[IF_NAME_SZ];
uint16_t queue_id;
int enable;
};
static void
vring_state_changed__(struct vhost_state_change *sc)
{
struct netdev_dpdk *dev;
bool exists = false;
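    /* Virtqueue ids interleave per queue pair.  A guest TX ring
     * (VIRTIO_TXQ) is the receive side from the OVS point of view. */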
int qid = sc->queue_id / VIRTIO_QNUM;
bool is_rx = (sc->queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
ovs_mutex_lock(&dpdk_mutex);
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
ovs_mutex_lock(&dev->mutex);
if (nullable_string_is_equal(sc->ifname, dev->vhost_id)) {
if (is_rx) {
bool old_state = dev->vhost_rxq_enabled[qid];
dev->vhost_rxq_enabled[qid] = sc->enable != 0;
if (old_state != dev->vhost_rxq_enabled[qid]) {
netdev_change_seq_changed(&dev->up);
}
} else {
if (sc->enable) {
dev->tx_q[qid].map = qid;
} else {
dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
}
netdev_dpdk_remap_txqs(dev);
}
exists = true;
ovs_mutex_unlock(&dev->mutex);
break;
}
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
if (exists) {
VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
"changed to \'%s\'", sc->queue_id, is_rx ? "rx" : "tx",
qid, sc->ifname, sc->enable == 1 ? "enabled" : "disabled");
} else {
VLOG_INFO("vHost Device '%s' not found", sc->ifname);
}
}
#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN 1
#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX 64
static void *
netdev_dpdk_vhost_events_main(void *arg OVS_UNUSED)
{
mpsc_queue_acquire(&vhost_state_change_queue);
for (;;) {
struct mpsc_queue_node *node;
uint64_t backoff;
backoff = NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN;
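        /* Poll for queued events with exponential backoff, sleeping from
         * 1 ms up to 64 ms between checks while the queue stays empty. */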
while (mpsc_queue_tail(&vhost_state_change_queue) == NULL) {
xnanosleep(backoff * 1E6);
if (backoff < NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX) {
backoff <<= 1;
}
}
MPSC_QUEUE_FOR_EACH_POP (node, &vhost_state_change_queue) {
struct vhost_state_change *sc;
sc = CONTAINER_OF(node, struct vhost_state_change, node);
vring_state_changed__(sc);
free(sc);
atomic_count_dec64(&vhost_state_change_queue_size);
}
}
OVS_NOT_REACHED();
mpsc_queue_release(&vhost_state_change_queue);
return NULL;
}
static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
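    /* This callback runs in a vhost library thread.  Defer the actual
     * processing, which needs 'dpdk_mutex', to the 'ovs_vhost' thread via
     * the MPSC queue. */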
static struct vlog_rate_limit vhost_rl = VLOG_RATE_LIMIT_INIT(5, 5);
struct vhost_state_change *sc;
sc = xmalloc(sizeof *sc);
if (!rte_vhost_get_ifname(vid, sc->ifname, sizeof sc->ifname)) {
uint64_t queue_size;
sc->queue_id = queue_id;
sc->enable = enable;
mpsc_queue_insert(&vhost_state_change_queue, &sc->node);
queue_size = atomic_count_inc64(&vhost_state_change_queue_size);
if (queue_size >= 1000) {
VLOG_WARN_RL(&vhost_rl, "vring state change queue has %"PRIu64" "
"entries. Last update was for socket %s.", queue_size,
sc->ifname);
}
} else {
free(sc);
}
return 0;
}
static void
destroy_connection(int vid)
{
struct netdev_dpdk *dev;
char ifname[IF_NAME_SZ];
bool exists = false;
rte_vhost_get_ifname(vid, ifname, sizeof ifname);
ovs_mutex_lock(&dpdk_mutex);
LIST_FOR_EACH (dev, list_node, &dpdk_list) {
ovs_mutex_lock(&dev->mutex);
if (nullable_string_is_equal(ifname, dev->vhost_id)) {
uint32_t qp_num = NR_QUEUE;
if (netdev_dpdk_get_vid(dev) >= 0) {
VLOG_ERR("Connection on socket '%s' destroyed while vhost "
"device still attached.", dev->vhost_id);
}
/* Restore the number of queue pairs to default. */
if (dev->requested_n_txq != qp_num
|| dev->requested_n_rxq != qp_num) {
dev->requested_n_rxq = qp_num;
dev->requested_n_txq = qp_num;
netdev_request_reconfigure(&dev->up);
}
if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) {
/* The socket disconnected before reaching new_device. It
* likely means that the guest did not agree with the virtio
* features. */
VLOG_WARN_RL(&rl, "Connection on socket '%s' closed during "
"initialization.", dev->vhost_id);
}
if (!(dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING)) {
switch (dev->virtio_features_state) {
case OVS_VIRTIO_F_CLEAN:
dev->virtio_features_state = OVS_VIRTIO_F_WORKAROUND;
break;
case OVS_VIRTIO_F_WORKAROUND:
dev->virtio_features_state = OVS_VIRTIO_F_CLEAN;
break;
case OVS_VIRTIO_F_CLEAN_NEGOTIATED:
/* The virtio features were clean and got accepted by the
* guest. We expect it will be the case in the future and
* change nothing. */
break;
case OVS_VIRTIO_F_WORKAROUND_NEGOTIATED:
                    /* Let's try to go with clean virtio features on the
                     * next connection. */
dev->virtio_features_state = OVS_VIRTIO_F_CLEAN;
break;
default:
OVS_NOT_REACHED();
}
if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) {
dev->virtio_features_state |= OVS_VIRTIO_F_RECONF_PENDING;
netdev_request_reconfigure(&dev->up);
}
}
ovs_mutex_unlock(&dev->mutex);
exists = true;
break;
}
ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
if (exists) {
VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
} else {
VLOG_INFO("vHost Device '%s' not found", ifname);
}
}
/*
* Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
* or vhostuserclient netdev.
*
 * Returns a value greater than or equal to zero for a valid vid or '-1' if
 * there is no valid vid associated. A vid of '-1' must not be used in
 * rte_vhost_ API calls.
*
* Once obtained and validated, a vid can be used by a PMD for multiple
* subsequent rte_vhost API calls until the PMD quiesces. A PMD should
* not fetch the vid again for each of a series of API calls.
*/
int
netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
{
return ovsrcu_index_get(&dev->vid);
}
static int
netdev_dpdk_class_init(void)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
/* This function can be called for different classes. The initialization
     * needs to be done only once. */
if (ovsthread_once_start(&once)) {
int ret;
ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
unixctl_command_register("netdev-dpdk/set-admin-state",
"[netdev] up|down", 1, 2,
netdev_dpdk_set_admin_state, NULL);
unixctl_command_register("netdev-dpdk/detach",
"pci address of device", 1, 1,
netdev_dpdk_detach, NULL);
unixctl_command_register("netdev-dpdk/get-mempool-info",
"[netdev]", 0, 1,
netdev_dpdk_get_mempool_info, NULL);
netdev_dpdk_reset_seq = seq_create();
netdev_dpdk_last_reset_seq = seq_read(netdev_dpdk_reset_seq);
ret = rte_eth_dev_callback_register(RTE_ETH_ALL,
RTE_ETH_EVENT_INTR_RESET,
dpdk_eth_event_callback, NULL);
if (ret != 0) {
VLOG_ERR("Ethernet device callback register error: %s",
rte_strerror(-ret));
}
ovsthread_once_done(&once);
}
return 0;
}
static int
netdev_dpdk_vhost_class_init(void)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
if (ovsthread_once_start(&once)) {
ovs_thread_create("ovs_vhost", netdev_dpdk_vhost_events_main, NULL);
ovsthread_once_done(&once);
}
return 0;
}
/* QoS Functions */
struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
{
return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
}
/*
* Initialize QoS configuration operations.
*/
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
conf->ops = ops;
rte_spinlock_init(&conf->lock);
}
/*
* Search existing QoS operations in qos_ops and compare each set of
* operations qos_name to name. Return a dpdk_qos_ops pointer to a match,
* else return NULL
*/
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
const struct dpdk_qos_ops *const *opsp;
for (opsp = qos_confs; *opsp != NULL; opsp++) {
const struct dpdk_qos_ops *ops = *opsp;
if (!strcmp(name, ops->qos_name)) {
return ops;
}
}
return NULL;
}
static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
struct sset *types)
{
const struct dpdk_qos_ops *const *opsp;
for (opsp = qos_confs; *opsp != NULL; opsp++) {
const struct dpdk_qos_ops *ops = *opsp;
if (ops->qos_construct && ops->qos_name[0] != '\0') {
sset_add(types, ops->qos_name);
}
}
return 0;
}
static int
netdev_dpdk_get_qos(const struct netdev *netdev,
const char **typep, struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf) {
*typep = qos_conf->ops->qos_name;
error = (qos_conf->ops->qos_get
? qos_conf->ops->qos_get(qos_conf, details): 0);
} else {
/* No QoS configuration set, return an empty string */
*typep = "";
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
const struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const struct dpdk_qos_ops *new_ops = NULL;
struct qos_conf *qos_conf, *new_qos_conf = NULL;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
new_ops = qos_lookup_name(type);
if (!new_ops || !new_ops->qos_construct) {
new_qos_conf = NULL;
if (type && type[0]) {
error = EOPNOTSUPP;
}
} else if (qos_conf && qos_conf->ops == new_ops
&& qos_conf->ops->qos_is_equal(qos_conf, details)) {
new_qos_conf = qos_conf;
} else {
error = new_ops->qos_construct(details, &new_qos_conf);
}
if (error) {
VLOG_ERR("Failed to set QoS type %s on port %s: %s",
type, netdev->name, rte_strerror(error));
}
if (new_qos_conf != qos_conf) {
ovsrcu_set(&dev->qos_conf, new_qos_conf);
if (qos_conf) {
ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
}
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_get_queue(const struct netdev *netdev, uint32_t queue_id,
struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (!qos_conf || !qos_conf->ops || !qos_conf->ops->qos_queue_get) {
error = EOPNOTSUPP;
} else {
error = qos_conf->ops->qos_queue_get(details, queue_id, qos_conf);
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_set_queue(struct netdev *netdev, uint32_t queue_id,
const struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (!qos_conf || !qos_conf->ops || !qos_conf->ops->qos_queue_construct) {
error = EOPNOTSUPP;
} else {
error = qos_conf->ops->qos_queue_construct(details, queue_id,
qos_conf);
}
if (error && error != EOPNOTSUPP) {
VLOG_ERR("Failed to set QoS queue %d on port %s: %s",
queue_id, netdev_get_name(netdev), rte_strerror(error));
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_delete_queue(struct netdev *netdev, uint32_t queue_id)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_destruct) {
qos_conf->ops->qos_queue_destruct(qos_conf, queue_id);
} else {
error = EOPNOTSUPP;
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_get_queue_stats(const struct netdev *netdev, uint32_t queue_id,
struct netdev_queue_stats *stats)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct qos_conf *qos_conf;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_get_stats) {
qos_conf->ops->qos_queue_get_stats(qos_conf, queue_id, stats);
} else {
error = EOPNOTSUPP;
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_queue_dump_start(const struct netdev *netdev, void **statep)
{
int error = 0;
struct qos_conf *qos_conf;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf && qos_conf->ops
&& qos_conf->ops->qos_queue_dump_state_init) {
struct netdev_dpdk_queue_state *state;
*statep = state = xmalloc(sizeof *state);
error = qos_conf->ops->qos_queue_dump_state_init(qos_conf, state);
} else {
error = EOPNOTSUPP;
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_queue_dump_next(const struct netdev *netdev, void *state_,
uint32_t *queue_idp, struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct netdev_dpdk_queue_state *state = state_;
struct qos_conf *qos_conf;
int error = EOF;
ovs_mutex_lock(&dev->mutex);
while (state->cur_queue < state->n_queues) {
uint32_t queue_id = state->queues[state->cur_queue++];
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_get) {
*queue_idp = queue_id;
error = qos_conf->ops->qos_queue_get(details, queue_id, qos_conf);
break;
}
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
static int
netdev_dpdk_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
void *state_)
{
struct netdev_dpdk_queue_state *state = state_;
free(state->queues);
free(state);
return 0;
}
/* egress-policer details */
struct egress_policer {
struct qos_conf qos_conf;
struct rte_meter_srtcm_params app_srtcm_params;
struct rte_meter_srtcm egress_meter;
struct rte_meter_srtcm_profile egress_prof;
};
static void
egress_policer_details_to_param(const struct smap *details,
struct rte_meter_srtcm_params *params)
{
memset(params, 0, sizeof *params);
params->cir = smap_get_ullong(details, "cir", 0);
params->cbs = smap_get_ullong(details, "cbs", 0);
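    /* With a zero excess burst size, the srTCM meter marks packets either
     * green or red; the yellow bucket is never used. */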
params->ebs = 0;
}
static int
egress_policer_qos_construct(const struct smap *details,
struct qos_conf **conf)
{
struct egress_policer *policer;
int err = 0;
policer = xmalloc(sizeof *policer);
qos_conf_init(&policer->qos_conf, &egress_policer_ops);
egress_policer_details_to_param(details, &policer->app_srtcm_params);
err = rte_meter_srtcm_profile_config(&policer->egress_prof,
&policer->app_srtcm_params);
if (!err) {
err = rte_meter_srtcm_config(&policer->egress_meter,
&policer->egress_prof);
}
if (!err) {
*conf = &policer->qos_conf;
} else {
VLOG_ERR("Could not create rte meter for egress policer");
free(policer);
*conf = NULL;
err = -err;
}
return err;
}
static void
egress_policer_qos_destruct(struct qos_conf *conf)
{
struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
qos_conf);
free(policer);
}
static int
egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
return 0;
}
static bool
egress_policer_qos_is_equal(const struct qos_conf *conf,
const struct smap *details)
{
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
struct rte_meter_srtcm_params params;
egress_policer_details_to_param(details, &params);
return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
}
static int
egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
bool should_steal)
{
int cnt = 0;
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
cnt = srtcm_policer_run_single_packet(&policer->egress_meter,
&policer->egress_prof, pkts,
pkt_cnt, should_steal);
return cnt;
}
static const struct dpdk_qos_ops egress_policer_ops = {
.qos_name = "egress-policer", /* qos_name */
.qos_construct = egress_policer_qos_construct,
.qos_destruct = egress_policer_qos_destruct,
.qos_get = egress_policer_qos_get,
.qos_is_equal = egress_policer_qos_is_equal,
.qos_run = egress_policer_run
};
/* trtcm-policer details */
struct trtcm_policer {
struct qos_conf qos_conf;
struct rte_meter_trtcm_rfc4115_params meter_params;
struct rte_meter_trtcm_rfc4115_profile meter_profile;
struct rte_meter_trtcm_rfc4115 meter;
struct netdev_queue_stats stats;
struct hmap queues;
};
struct trtcm_policer_queue {
struct hmap_node hmap_node;
uint32_t queue_id;
struct rte_meter_trtcm_rfc4115_params meter_params;
struct rte_meter_trtcm_rfc4115_profile meter_profile;
struct rte_meter_trtcm_rfc4115 meter;
struct netdev_queue_stats stats;
};
static void
trtcm_policer_details_to_param(const struct smap *details,
struct rte_meter_trtcm_rfc4115_params *params)
{
memset(params, 0, sizeof *params);
params->cir = smap_get_ullong(details, "cir", 0);
params->eir = smap_get_ullong(details, "eir", 0);
params->cbs = smap_get_ullong(details, "cbs", 0);
params->ebs = smap_get_ullong(details, "ebs", 0);
}
static void
trtcm_policer_param_to_detail(
const struct rte_meter_trtcm_rfc4115_params *params,
struct smap *details)
{
smap_add_format(details, "cir", "%"PRIu64, params->cir);
smap_add_format(details, "eir", "%"PRIu64, params->eir);
smap_add_format(details, "cbs", "%"PRIu64, params->cbs);
smap_add_format(details, "ebs", "%"PRIu64, params->ebs);
}
static int
trtcm_policer_qos_construct(const struct smap *details,
struct qos_conf **conf)
{
struct trtcm_policer *policer;
int err = 0;
policer = xmalloc(sizeof *policer);
qos_conf_init(&policer->qos_conf, &trtcm_policer_ops);
trtcm_policer_details_to_param(details, &policer->meter_params);
err = rte_meter_trtcm_rfc4115_profile_config(&policer->meter_profile,
&policer->meter_params);
if (!err) {
err = rte_meter_trtcm_rfc4115_config(&policer->meter,
&policer->meter_profile);
}
if (!err) {
*conf = &policer->qos_conf;
memset(&policer->stats, 0, sizeof policer->stats);
hmap_init(&policer->queues);
} else {
free(policer);
*conf = NULL;
err = -err;
}
return err;
}
static void
trtcm_policer_qos_destruct(struct qos_conf *conf)
{
struct trtcm_policer_queue *queue;
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
HMAP_FOR_EACH_SAFE (queue, hmap_node, &policer->queues) {
hmap_remove(&policer->queues, &queue->hmap_node);
free(queue);
}
hmap_destroy(&policer->queues);
free(policer);
}
static int
trtcm_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
trtcm_policer_param_to_detail(&policer->meter_params, details);
return 0;
}
static bool
trtcm_policer_qos_is_equal(const struct qos_conf *conf,
const struct smap *details)
{
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
struct rte_meter_trtcm_rfc4115_params params;
trtcm_policer_details_to_param(details, &params);
return !memcmp(&params, &policer->meter_params, sizeof params);
}
static struct trtcm_policer_queue *
trtcm_policer_qos_find_queue(struct trtcm_policer *policer, uint32_t queue_id)
{
struct trtcm_policer_queue *queue;
HMAP_FOR_EACH_WITH_HASH (queue, hmap_node, hash_2words(queue_id, 0),
&policer->queues) {
if (queue->queue_id == queue_id) {
return queue;
}
}
return NULL;
}
static inline bool
trtcm_policer_run_single_packet(struct trtcm_policer *policer,
struct rte_mbuf *pkt, uint64_t time)
{
enum rte_color pkt_color;
struct trtcm_policer_queue *queue;
uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);
struct dp_packet *dpkt = CONTAINER_OF(pkt, struct dp_packet, mbuf);
queue = trtcm_policer_qos_find_queue(policer, dpkt->md.skb_priority);
if (!queue) {
/* If no queue is found, use the default queue, which MUST exist. */
queue = trtcm_policer_qos_find_queue(policer, 0);
if (!queue) {
return false;
}
}
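    /* Meter the packet twice: first against the per-queue profile in
     * color-blind mode, then against the port-level profile in color-aware
     * mode, following the RFC 4115 trTCM scheme. */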
pkt_color = rte_meter_trtcm_rfc4115_color_blind_check(&queue->meter,
&queue->meter_profile,
time,
pkt_len);
if (pkt_color == RTE_COLOR_RED) {
queue->stats.tx_errors++;
} else {
queue->stats.tx_bytes += pkt_len;
queue->stats.tx_packets++;
}
pkt_color = rte_meter_trtcm_rfc4115_color_aware_check(&policer->meter,
&policer->meter_profile,
time, pkt_len,
pkt_color);
if (pkt_color == RTE_COLOR_RED) {
policer->stats.tx_errors++;
return false;
}
policer->stats.tx_bytes += pkt_len;
policer->stats.tx_packets++;
return true;
}
static int
trtcm_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
bool should_steal)
{
int i = 0;
int cnt = 0;
struct rte_mbuf *pkt = NULL;
uint64_t current_time = rte_rdtsc();
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
for (i = 0; i < pkt_cnt; i++) {
pkt = pkts[i];
if (trtcm_policer_run_single_packet(policer, pkt, current_time)) {
if (cnt != i) {
pkts[cnt] = pkt;
}
cnt++;
} else {
if (should_steal) {
rte_pktmbuf_free(pkt);
}
}
}
return cnt;
}
static int
trtcm_policer_qos_queue_construct(const struct smap *details,
uint32_t queue_id, struct qos_conf *conf)
{
int err = 0;
struct trtcm_policer_queue *queue;
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
queue = trtcm_policer_qos_find_queue(policer, queue_id);
if (!queue) {
queue = xmalloc(sizeof *queue);
queue->queue_id = queue_id;
memset(&queue->stats, 0, sizeof queue->stats);
queue->stats.created = time_msec();
hmap_insert(&policer->queues, &queue->hmap_node,
hash_2words(queue_id, 0));
}
if (queue_id == 0 && smap_is_empty(details)) {
/* No default queue configured, use port values */
memcpy(&queue->meter_params, &policer->meter_params,
sizeof queue->meter_params);
} else {
trtcm_policer_details_to_param(details, &queue->meter_params);
}
err = rte_meter_trtcm_rfc4115_profile_config(&queue->meter_profile,
&queue->meter_params);
if (!err) {
err = rte_meter_trtcm_rfc4115_config(&queue->meter,
&queue->meter_profile);
}
if (err) {
hmap_remove(&policer->queues, &queue->hmap_node);
free(queue);
err = -err;
}
return err;
}
static void
trtcm_policer_qos_queue_destruct(struct qos_conf *conf, uint32_t queue_id)
{
struct trtcm_policer_queue *queue;
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
queue = trtcm_policer_qos_find_queue(policer, queue_id);
if (queue) {
hmap_remove(&policer->queues, &queue->hmap_node);
free(queue);
}
}
static int
trtcm_policer_qos_queue_get(struct smap *details, uint32_t queue_id,
const struct qos_conf *conf)
{
struct trtcm_policer_queue *queue;
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
queue = trtcm_policer_qos_find_queue(policer, queue_id);
if (!queue) {
return EINVAL;
}
trtcm_policer_param_to_detail(&queue->meter_params, details);
return 0;
}
static int
trtcm_policer_qos_queue_get_stats(const struct qos_conf *conf,
uint32_t queue_id,
struct netdev_queue_stats *stats)
{
struct trtcm_policer_queue *queue;
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
queue = trtcm_policer_qos_find_queue(policer, queue_id);
if (!queue) {
return EINVAL;
}
memcpy(stats, &queue->stats, sizeof *stats);
return 0;
}
static int
trtcm_policer_qos_queue_dump_state_init(const struct qos_conf *conf,
struct netdev_dpdk_queue_state *state)
{
uint32_t i = 0;
struct trtcm_policer_queue *queue;
struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
qos_conf);
state->n_queues = hmap_count(&policer->queues);
state->cur_queue = 0;
state->queues = xmalloc(state->n_queues * sizeof *state->queues);
HMAP_FOR_EACH (queue, hmap_node, &policer->queues) {
state->queues[i++] = queue->queue_id;
}
return 0;
}
static const struct dpdk_qos_ops trtcm_policer_ops = {
.qos_name = "trtcm-policer",
.qos_construct = trtcm_policer_qos_construct,
.qos_destruct = trtcm_policer_qos_destruct,
.qos_get = trtcm_policer_qos_get,
.qos_is_equal = trtcm_policer_qos_is_equal,
.qos_run = trtcm_policer_run,
.qos_queue_construct = trtcm_policer_qos_queue_construct,
.qos_queue_destruct = trtcm_policer_qos_queue_destruct,
.qos_queue_get = trtcm_policer_qos_queue_get,
.qos_queue_get_stats = trtcm_policer_qos_queue_get_stats,
.qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init
};
static int
dpdk_rx_steer_add_flow(struct netdev_dpdk *dev,
const struct rte_flow_item items[],
const char *desc)
{
const struct rte_flow_attr attr = { .ingress = 1 };
const struct rte_flow_action actions[] = {
{
.type = RTE_FLOW_ACTION_TYPE_QUEUE,
.conf = &(const struct rte_flow_action_queue) {
.index = dev->up.n_rxq - 1,
},
},
{ .type = RTE_FLOW_ACTION_TYPE_END },
};
struct rte_flow_error error;
struct rte_flow *flow;
size_t num;
int err;
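    /* Validate the flow before creating it, so that missing hardware
     * support is reported distinctly from a creation failure. */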
set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
err = rte_flow_validate(dev->port_id, &attr, items, actions, &error);
if (err) {
VLOG_WARN("%s: rx-steering: device does not support %s flow: %s",
netdev_get_name(&dev->up), desc,
error.message ? error.message : "");
goto out;
}
set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
flow = rte_flow_create(dev->port_id, &attr, items, actions, &error);
if (flow == NULL) {
VLOG_WARN("%s: rx-steering: failed to add %s flow: %s",
netdev_get_name(&dev->up), desc,
error.message ? error.message : "");
err = rte_errno;
goto out;
}
num = dev->rx_steer_flows_num + 1;
dev->rx_steer_flows = xrealloc(dev->rx_steer_flows, num * sizeof flow);
dev->rx_steer_flows[dev->rx_steer_flows_num] = flow;
dev->rx_steer_flows_num = num;
VLOG_INFO("%s: rx-steering: redirected %s traffic to rx queue %d",
netdev_get_name(&dev->up), desc, dev->up.n_rxq - 1);
out:
return err;
}
#define RETA_CONF_SIZE (RTE_ETH_RSS_RETA_SIZE_512 / RTE_ETH_RETA_GROUP_SIZE)
static int
dpdk_rx_steer_rss_configure(struct netdev_dpdk *dev, int rss_n_rxq)
{
struct rte_eth_rss_reta_entry64 reta_conf[RETA_CONF_SIZE];
struct rte_eth_dev_info info;
int err;
err = rte_eth_dev_info_get(dev->port_id, &info);
if (err < 0) {
VLOG_WARN("%s: failed to query RSS info: %s",
netdev_get_name(&dev->up), rte_strerror(-err));
goto error;
}
if (info.reta_size % rss_n_rxq != 0 &&
info.reta_size < RTE_ETH_RSS_RETA_SIZE_128) {
/*
* Some drivers set reta_size equal to the total number of rxqs that
* are configured when it is a power of two. Since we are actually
* reconfiguring the redirection table to exclude the last rxq, we may
         * end up with an imbalanced redirection table. For example, such a
         * configuration:
*
* options:n_rxq=3 options:rx-steering=rss+lacp
*
* Will actually configure 4 rxqs on the NIC, and the default reta to:
*
* [0, 1, 2, 3]
*
* And dpdk_rx_steer_rss_configure() will reconfigure reta to:
*
* [0, 1, 2, 0]
*
* Causing queue 0 to receive twice as much traffic as queues 1 and 2.
*
* Work around that corner case by forcing a bigger redirection table
* size to 128 entries when reta_size is not a multiple of rss_n_rxq
* and when reta_size is less than 128. This value seems to be
* supported by most of the drivers that also support rte_flow.
*/
info.reta_size = RTE_ETH_RSS_RETA_SIZE_128;
}
memset(reta_conf, 0, sizeof reta_conf);
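    /* Fill the redirection table round-robin over the first 'rss_n_rxq'
     * queues, leaving the last (steering) queue out of the RSS spread. */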
for (uint16_t i = 0; i < info.reta_size; i++) {
uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
reta_conf[idx].mask |= 1ULL << shift;
reta_conf[idx].reta[shift] = i % rss_n_rxq;
}
err = rte_eth_dev_rss_reta_update(dev->port_id, reta_conf, info.reta_size);
if (err < 0) {
VLOG_WARN("%s: failed to configure RSS redirection table: err=%d",
netdev_get_name(&dev->up), err);
}
error:
return err;
}
static int
dpdk_rx_steer_configure(struct netdev_dpdk *dev)
{
int err = 0;
if (dev->up.n_rxq < 2) {
err = ENOTSUP;
VLOG_WARN("%s: rx-steering: not enough available rx queues",
netdev_get_name(&dev->up));
goto out;
}
if (dev->requested_rx_steer_flags & DPDK_RX_STEER_LACP) {
const struct rte_flow_item items[] = {
{
.type = RTE_FLOW_ITEM_TYPE_ETH,
.spec = &(const struct rte_flow_item_eth){
.type = htons(ETH_TYPE_LACP),
},
.mask = &(const struct rte_flow_item_eth){
.type = htons(0xffff),
},
},
{ .type = RTE_FLOW_ITEM_TYPE_END },
};
err = dpdk_rx_steer_add_flow(dev, items, "lacp");
if (err) {
goto out;
}
}
if (dev->rx_steer_flows_num) {
/* Reconfigure RSS reta in all but the rx steering queue. */
err = dpdk_rx_steer_rss_configure(dev, dev->up.n_rxq - 1);
if (err) {
goto out;
}
if (dev->up.n_rxq == 2) {
VLOG_INFO("%s: rx-steering: redirected other traffic to "
"rx queue 0", netdev_get_name(&dev->up));
} else {
VLOG_INFO("%s: rx-steering: applied rss on rx queues 0-%u",
netdev_get_name(&dev->up), dev->up.n_rxq - 2);
}
}
out:
return err;
}
static void
dpdk_rx_steer_unconfigure(struct netdev_dpdk *dev)
{
struct rte_flow_error error;
if (!dev->rx_steer_flows_num) {
return;
}
VLOG_DBG("%s: rx-steering: reset flows", netdev_get_name(&dev->up));
for (int i = 0; i < dev->rx_steer_flows_num; i++) {
set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
if (rte_flow_destroy(dev->port_id, dev->rx_steer_flows[i], &error)) {
VLOG_WARN("%s: rx-steering: failed to destroy flow: %s",
netdev_get_name(&dev->up),
error.message ? error.message : "");
}
}
free(dev->rx_steer_flows);
dev->rx_steer_flows_num = 0;
dev->rx_steer_flows = NULL;
/*
* Most DPDK drivers seem to reset their RSS redirection table in
* rte_eth_dev_configure() or rte_eth_dev_start(), both of which are
* called in dpdk_eth_dev_init(). No need to explicitly reset it.
*/
}
static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bool pending_reset;
bool try_rx_steer;
int err = 0;
ovs_mutex_lock(&dev->mutex);
try_rx_steer = dev->requested_rx_steer_flags != 0;
dev->requested_n_rxq = dev->user_n_rxq;
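    /* rx-steering reserves one extra rx queue on top of the user-requested
     * count for the steered traffic. */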
if (try_rx_steer) {
dev->requested_n_rxq += 1;
}
atomic_read_relaxed(&netdev_dpdk_pending_reset[dev->port_id],
&pending_reset);
if (netdev->n_txq == dev->requested_n_txq
&& netdev->n_rxq == dev->requested_n_rxq
&& dev->rx_steer_flags == dev->requested_rx_steer_flags
&& dev->mtu == dev->requested_mtu
&& dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
&& dev->rxq_size == dev->requested_rxq_size
&& dev->txq_size == dev->requested_txq_size
&& eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)
&& dev->socket_id == dev->requested_socket_id
&& dev->started && !pending_reset) {
/* Reconfiguration is unnecessary */
goto out;
}
retry:
dpdk_rx_steer_unconfigure(dev);
if (pending_reset) {
/*
* Set false before reset to avoid missing a new reset interrupt event
* in a race with event callback.
*/
atomic_store_relaxed(&netdev_dpdk_pending_reset[dev->port_id], false);
rte_eth_dev_reset(dev->port_id);
if_notifier_manual_report();
} else {
rte_eth_dev_stop(dev->port_id);
}
dev->started = false;
err = netdev_dpdk_mempool_configure(dev);
if (err && err != EEXIST) {
goto out;
}
dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;
netdev->n_txq = dev->requested_n_txq;
netdev->n_rxq = dev->requested_n_rxq;
dev->rxq_size = dev->requested_rxq_size;
dev->txq_size = dev->requested_txq_size;
rte_free(dev->tx_q);
dev->tx_q = NULL;
if (!eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)) {
err = netdev_dpdk_set_etheraddr__(dev, dev->requested_hwaddr);
if (err) {
goto out;
}
}
err = dpdk_eth_dev_init(dev);
if (err) {
goto out;
}
netdev_dpdk_update_netdev_flags(dev);
/* If both requested and actual hwaddr were previously
* unset (initialized to 0), then first device init above
* will have set actual hwaddr to something new.
* This would trigger spurious MAC reconfiguration unless
* the requested MAC is kept in sync.
*
     * This is harmless in case requested_hwaddr was
     * configured by the user, as netdev_dpdk_set_etheraddr__()
     * must have succeeded for execution to reach this point.
*/
dev->requested_hwaddr = dev->hwaddr;
if (try_rx_steer) {
err = dpdk_rx_steer_configure(dev);
if (err) {
/* No hw support, disable & recover gracefully. */
try_rx_steer = false;
/*
* The extra queue must be explicitly removed here to ensure that
* it is unconfigured immediately.
*/
dev->requested_n_rxq = dev->user_n_rxq;
goto retry;
}
} else {
VLOG_INFO("%s: rx-steering: default rss", netdev_get_name(&dev->up));
}
dev->rx_steer_flags = dev->requested_rx_steer_flags;
dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
if (!dev->tx_q) {
err = ENOMEM;
}
netdev_change_seq_changed(netdev);
out:
ovs_mutex_unlock(&dev->mutex);
return err;
}
static int
dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
dev->up.n_txq = dev->requested_n_txq;
dev->up.n_rxq = dev->requested_n_rxq;
/* Always keep RX queue 0 enabled for implementations that won't
* report vring states. */
dev->vhost_rxq_enabled[0] = true;
/* Enable TX queue 0 by default if it wasn't disabled. */
if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
dev->tx_q[0].map = 0;
}
rte_spinlock_lock(&dev->stats_lock);
memset(&dev->stats, 0, sizeof dev->stats);
memset(dev->sw_stats, 0, sizeof *dev->sw_stats);
rte_spinlock_unlock(&dev->stats_lock);
netdev_dpdk_remap_txqs(dev);
if (netdev_dpdk_get_vid(dev) >= 0) {
int err;
err = netdev_dpdk_mempool_configure(dev);
if (!err) {
/* A new mempool was created or re-used. */
netdev_change_seq_changed(&dev->up);
} else if (err != EEXIST) {
return err;
}
if (dev->vhost_reconfigured == false) {
dev->vhost_reconfigured = true;
/* Carrier status may need updating. */
netdev_change_seq_changed(&dev->up);
}
}
netdev_dpdk_update_netdev_flags(dev);
return 0;
}
static int
netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int err;
ovs_mutex_lock(&dev->mutex);
err = dpdk_vhost_reconfigure_helper(dev);
ovs_mutex_unlock(&dev->mutex);
return err;
}
static int
netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bool unregister = false;
char *vhost_id;
int err;
ovs_mutex_lock(&dev->mutex);
if (dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT && dev->vhost_id
&& dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING) {
/* This vhost-user port was registered to the vhost library already,
* but a socket disconnection happened and configuration must be
* re-evaluated wrt dev->virtio_features_state. */
dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
vhost_id = dev->vhost_id;
unregister = true;
}
ovs_mutex_unlock(&dev->mutex);
if (unregister) {
dpdk_vhost_driver_unregister(dev, vhost_id);
}
ovs_mutex_lock(&dev->mutex);
/* Configure vHost client mode if requested and if the following criteria
* are met:
* 1. Device hasn't been registered yet.
* 2. A path has been specified.
*/
if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
uint64_t virtio_unsup_features = 0;
uint64_t vhost_flags = 0;
bool enable_tso;
enable_tso = userspace_tso_enabled()
&& dev->virtio_features_state & OVS_VIRTIO_F_CLEAN;
dev->virtio_features_state &= ~OVS_VIRTIO_F_RECONF_PENDING;
/* Register client-mode device. */
vhost_flags |= RTE_VHOST_USER_CLIENT;
/* Extended per vq statistics. */
vhost_flags |= RTE_VHOST_USER_NET_STATS_ENABLE;
        /* There is no support for multi-segment buffers. */
vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
/* Enable IOMMU support, if explicitly requested. */
if (vhost_iommu_enabled) {
vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
}
/* Enable POSTCOPY support, if explicitly requested. */
if (vhost_postcopy_enabled) {
vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
}
/* Use "compliant" ol_flags API so that the vhost library behaves
* like a DPDK ethdev driver. */
vhost_flags |= RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
/* Enable External Buffers if TCP Segmentation Offload is enabled. */
if (enable_tso) {
vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
}
err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
if (err) {
VLOG_ERR("vhost-user device setup failure for device %s\n",
dev->vhost_id);
goto unlock;
} else {
/* Configuration successful */
dev->vhost_driver_flags |= vhost_flags;
VLOG_INFO("vHost User device '%s' created in 'client' mode, "
"using client socket '%s'",
dev->up.name, dev->vhost_id);
}
err = rte_vhost_driver_callback_register(dev->vhost_id,
&virtio_net_device_ops);
if (err) {
VLOG_ERR("rte_vhost_driver_callback_register failed for "
"vhost user client port: %s\n", dev->up.name);
goto unlock;
}
if (enable_tso) {
virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_ECN
| 1ULL << VIRTIO_NET_F_HOST_UFO;
VLOG_DBG("%s: TSO enabled on vhost port",
netdev_get_name(&dev->up));
} else {
/* Advertise checksum offloading to the guest, but explicitly
* disable TSO and friends.
* NOTE: we can't disable HOST_ECN which may have been wrongly
* negotiated by a running guest. */
virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_TSO4
| 1ULL << VIRTIO_NET_F_HOST_TSO6
| 1ULL << VIRTIO_NET_F_HOST_UFO;
}
err = rte_vhost_driver_disable_features(dev->vhost_id,
virtio_unsup_features);
if (err) {
VLOG_ERR("rte_vhost_driver_disable_features failed for "
"vhost user client port: %s\n", dev->up.name);
goto unlock;
}
/* Setting max queue pairs is only useful and effective with VDUSE. */
if (strncmp(dev->vhost_id, "/dev/vduse/", 11) == 0) {
uint32_t max_qp = dev->vhost_max_queue_pairs;
err = rte_vhost_driver_set_max_queue_num(dev->vhost_id, max_qp);
if (err) {
VLOG_ERR("rte_vhost_driver_set_max_queue_num failed for "
"vhost-user client port: %s\n", dev->up.name);
goto unlock;
}
}
err = rte_vhost_driver_start(dev->vhost_id);
if (err) {
VLOG_ERR("rte_vhost_driver_start failed for vhost user "
"client port: %s\n", dev->up.name);
goto unlock;
}
}
err = dpdk_vhost_reconfigure_helper(dev);
unlock:
ovs_mutex_unlock(&dev->mutex);
return err;
}
int
netdev_dpdk_get_port_id(struct netdev *netdev)
{
struct netdev_dpdk *dev;
int ret = -1;
if (!is_dpdk_class(netdev->netdev_class)) {
goto out;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ret = dev->port_id;
ovs_mutex_unlock(&dev->mutex);
out:
return ret;
}
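
/* Returns true if rte_flow offload may be attempted on 'netdev': vxlan and
 * gre ports on the userspace datapath are always eligible, and DPDK ethdev
 * ports become eligible once rx-steering, which is mutually exclusive with
 * hw-offload, has been disabled. */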
bool
netdev_dpdk_flow_api_supported(struct netdev *netdev)
{
struct netdev_dpdk *dev;
bool ret = false;
if ((!strcmp(netdev_get_type(netdev), "vxlan") ||
!strcmp(netdev_get_type(netdev), "gre")) &&
!strcmp(netdev_get_dpif_type(netdev), "netdev")) {
ret = true;
goto out;
}
if (!is_dpdk_class(netdev->netdev_class)) {
goto out;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
if (dev->type == DPDK_DEV_ETH) {
if (dev->requested_rx_steer_flags) {
VLOG_WARN("%s: rx-steering is mutually exclusive with hw-offload,"
" falling back to default rss mode",
netdev_get_name(netdev));
dev->requested_rx_steer_flags = 0;
netdev_request_reconfigure(netdev);
}
        /* TODO: Check whether we are able to offload some minimal flow. */
ret = true;
}
ovs_mutex_unlock(&dev->mutex);
out:
return ret;
}
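
/* Thin wrappers dispatching rte_flow calls to the ethdev port backing
 * 'netdev'. */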
int
netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
struct rte_flow *rte_flow,
struct rte_flow_error *error)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int ret;
ret = rte_flow_destroy(dev->port_id, rte_flow, error);
return ret;
}

struct rte_flow *
netdev_dpdk_rte_flow_create(struct netdev *netdev,
const struct rte_flow_attr *attr,
const struct rte_flow_item *items,
const struct rte_flow_action *actions,
struct rte_flow_error *error)
{
struct rte_flow *flow;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
flow = rte_flow_create(dev->port_id, attr, items, actions, error);
return flow;
}

int
netdev_dpdk_rte_flow_query_count(struct netdev *netdev,
struct rte_flow *rte_flow,
struct rte_flow_query_count *query,
struct rte_flow_error *error)
{
struct rte_flow_action_count count = { .id = 0, };
const struct rte_flow_action actions[] = {
{
.type = RTE_FLOW_ACTION_TYPE_COUNT,
.conf = &count,
},
{
.type = RTE_FLOW_ACTION_TYPE_END,
},
};
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
return ret;
}
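
/* The tunnel offload helpers below rely on rte_flow tunnel APIs that are
 * gated behind ALLOW_EXPERIMENTAL_API, so they are compiled only when
 * experimental DPDK APIs are allowed. */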
#ifdef ALLOW_EXPERIMENTAL_API
int
netdev_dpdk_rte_flow_tunnel_decap_set(struct netdev *netdev,
struct rte_flow_tunnel *tunnel,
struct rte_flow_action **actions,
uint32_t *num_of_actions,
struct rte_flow_error *error)
{
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
num_of_actions, error);
ovs_mutex_unlock(&dev->mutex);
return ret;
}

int
netdev_dpdk_rte_flow_tunnel_match(struct netdev *netdev,
struct rte_flow_tunnel *tunnel,
struct rte_flow_item **items,
uint32_t *num_of_items,
struct rte_flow_error *error)
{
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ret = rte_flow_tunnel_match(dev->port_id, tunnel, items, num_of_items,
error);
ovs_mutex_unlock(&dev->mutex);
return ret;
}

int
netdev_dpdk_rte_flow_get_restore_info(struct netdev *netdev,
struct dp_packet *p,
struct rte_flow_restore_info *info,
struct rte_flow_error *error)
{
struct rte_mbuf *m = (struct rte_mbuf *) p;
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
ovs_mutex_unlock(&dev->mutex);
return ret;
}

int
netdev_dpdk_rte_flow_tunnel_action_decap_release(
struct netdev *netdev,
struct rte_flow_action *actions,
uint32_t num_of_actions,
struct rte_flow_error *error)
{
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
num_of_actions, error);
ovs_mutex_unlock(&dev->mutex);
return ret;
}

int
netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev,
struct rte_flow_item *items,
uint32_t num_of_items,
struct rte_flow_error *error)
{
struct netdev_dpdk *dev;
int ret;
if (!is_dpdk_class(netdev->netdev_class)) {
return -1;
}
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
ret = rte_flow_tunnel_item_release(dev->port_id, items, num_of_items,
error);
ovs_mutex_unlock(&dev->mutex);
return ret;
}

#endif /* ALLOW_EXPERIMENTAL_API */
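
/* Parses other_config:per-port-memory, selecting between the shared and
 * per-port mempool models. */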
static void
parse_mempool_config(const struct smap *ovs_other_config)
{
per_port_memory = smap_get_bool(ovs_other_config,
"per-port-memory", false);
VLOG_INFO("Per port memory for DPDK devices %s.",
per_port_memory ? "enabled" : "disabled");
}
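
/* Parses other_config:shared-mempool-config, a comma-separated list of
 * entries of the form MTU[:NUMA], e.g. "9000,1500:1" (illustrative value);
 * an entry without a NUMA id applies to all NUMA nodes.  On any parse
 * error, the whole list is discarded. */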
static void
parse_user_mempools_list(const struct smap *ovs_other_config)
{
const char *mtus = smap_get(ovs_other_config, "shared-mempool-config");
char *list, *copy, *key, *value;
int error = 0;
if (!mtus) {
return;
}
n_user_mempools = 0;
list = copy = xstrdup(mtus);
while (ofputil_parse_key_value(&list, &key, &value)) {
int socket_id, mtu, adj_mtu;
if (!str_to_int(key, 0, &mtu) || mtu < 0) {
error = EINVAL;
VLOG_WARN("Invalid user configured shared mempool MTU.");
break;
}
if (!str_to_int(value, 0, &socket_id)) {
            /* No socket specified; apply to all NUMA nodes. */
socket_id = INT_MAX;
} else if (socket_id < 0) {
error = EINVAL;
VLOG_WARN("Invalid user configured shared mempool NUMA.");
break;
}
user_mempools = xrealloc(user_mempools, (n_user_mempools + 1) *
sizeof(struct user_mempool_config));
adj_mtu = FRAME_LEN_TO_MTU(dpdk_buf_size(mtu));
user_mempools[n_user_mempools].adj_mtu = adj_mtu;
user_mempools[n_user_mempools].socket_id = socket_id;
n_user_mempools++;
VLOG_INFO("User configured shared mempool set for: MTU %d, NUMA %s.",
mtu, socket_id == INT_MAX ? "ALL" : value);
}
if (error) {
VLOG_WARN("User configured shared mempools will not be used.");
n_user_mempools = 0;
free(user_mempools);
user_mempools = NULL;
}
free(copy);
}
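
/* Copies the value of other_config 'flag' into '*new_val' when it is set
 * and no longer than 'size', otherwise copies 'default_val'.  Returns 1 if
 * the user-provided value was taken, 0 otherwise. */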
static int
process_vhost_flags(char *flag, const char *default_val, int size,
const struct smap *ovs_other_config,
char **new_val)
{
const char *val;
int changed = 0;
val = smap_get(ovs_other_config, flag);
    /* Process the given flag if it is provided, otherwise resort to the
     * default value.
     */
if (val && (strlen(val) <= size)) {
changed = 1;
*new_val = xstrdup(val);
VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
} else {
VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
*new_val = xstrdup(default_val);
}
return changed;
}
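
/* Parses vhost-user related other_config settings: the vhost-user socket
 * directory (rejecting paths containing '..') and the vhost-iommu-support
 * and vhost-postcopy-support booleans, set e.g. via (illustrative command):
 *   ovs-vsctl set Open_vSwitch . other_config:vhost-iommu-support=true */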
static void
parse_vhost_config(const struct smap *ovs_other_config)
{
char *sock_dir_subcomponent;
if (process_vhost_flags("vhost-sock-dir", ovs_rundir(),
NAME_MAX, ovs_other_config,
&sock_dir_subcomponent)) {
struct stat s;
if (!strstr(sock_dir_subcomponent, "..")) {
vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(),
sock_dir_subcomponent);
if (stat(vhost_sock_dir, &s)) {
VLOG_ERR("vhost-user sock directory '%s' does not exist.",
vhost_sock_dir);
}
} else {
vhost_sock_dir = xstrdup(ovs_rundir());
VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid"
"characters '..' - using %s instead.",
ovs_rundir(), sock_dir_subcomponent, ovs_rundir());
}
free(sock_dir_subcomponent);
} else {
vhost_sock_dir = sock_dir_subcomponent;
}
vhost_iommu_enabled = smap_get_bool(ovs_other_config,
"vhost-iommu-support", false);
VLOG_INFO("IOMMU support for vhost-user-client %s.",
vhost_iommu_enabled ? "enabled" : "disabled");
vhost_postcopy_enabled = smap_get_bool(ovs_other_config,
"vhost-postcopy-support", false);
if (vhost_postcopy_enabled && memory_all_locked()) {
VLOG_WARN("vhost-postcopy-support and mlockall are not compatible.");
vhost_postcopy_enabled = false;
}
VLOG_INFO("POSTCOPY support for vhost-user-client %s.",
vhost_postcopy_enabled ? "enabled" : "disabled");
}
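
/* netdev_class ops shared by every DPDK-based port type. */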
#define NETDEV_DPDK_CLASS_COMMON \
.is_pmd = true, \
.alloc = netdev_dpdk_alloc, \
.dealloc = netdev_dpdk_dealloc, \
.get_numa_id = netdev_dpdk_get_numa_id, \
.set_etheraddr = netdev_dpdk_set_etheraddr, \
.get_etheraddr = netdev_dpdk_get_etheraddr, \
.get_mtu = netdev_dpdk_get_mtu, \
.set_mtu = netdev_dpdk_set_mtu, \
.get_ifindex = netdev_dpdk_get_ifindex, \
.get_carrier_resets = netdev_dpdk_get_carrier_resets, \
.set_miimon_interval = netdev_dpdk_set_miimon, \
.set_policing = netdev_dpdk_set_policing, \
.get_qos_types = netdev_dpdk_get_qos_types, \
.get_qos = netdev_dpdk_get_qos, \
.set_qos = netdev_dpdk_set_qos, \
.get_queue = netdev_dpdk_get_queue, \
.set_queue = netdev_dpdk_set_queue, \
.delete_queue = netdev_dpdk_delete_queue, \
.get_queue_stats = netdev_dpdk_get_queue_stats, \
.queue_dump_start = netdev_dpdk_queue_dump_start, \
.queue_dump_next = netdev_dpdk_queue_dump_next, \
.queue_dump_done = netdev_dpdk_queue_dump_done, \
.update_flags = netdev_dpdk_update_flags, \
.rxq_alloc = netdev_dpdk_rxq_alloc, \
.rxq_construct = netdev_dpdk_rxq_construct, \
.rxq_destruct = netdev_dpdk_rxq_destruct, \
.rxq_dealloc = netdev_dpdk_rxq_dealloc
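
/* Ops for ports backed by a DPDK ethdev, on top of
 * NETDEV_DPDK_CLASS_COMMON. */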
#define NETDEV_DPDK_CLASS_BASE \
NETDEV_DPDK_CLASS_COMMON, \
.init = netdev_dpdk_class_init, \
.run = netdev_dpdk_run, \
.wait = netdev_dpdk_wait, \
.destruct = netdev_dpdk_destruct, \
.set_tx_multiq = netdev_dpdk_set_tx_multiq, \
.get_carrier = netdev_dpdk_get_carrier, \
.get_stats = netdev_dpdk_get_stats, \
.get_custom_stats = netdev_dpdk_get_custom_stats, \
.get_features = netdev_dpdk_get_features, \
.get_speed = netdev_dpdk_get_speed, \
.get_status = netdev_dpdk_get_status, \
.reconfigure = netdev_dpdk_reconfigure, \
.rxq_recv = netdev_dpdk_rxq_recv
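
/* Ports backed by a DPDK ethdev ('type=dpdk'). */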
static const struct netdev_class dpdk_class = {
.type = "dpdk",
NETDEV_DPDK_CLASS_BASE,
.construct = netdev_dpdk_construct,
.get_config = netdev_dpdk_get_config,
.set_config = netdev_dpdk_set_config,
.send = netdev_dpdk_eth_send,
};

static const struct netdev_class dpdk_vhost_class = {
.type = "dpdkvhostuser",
NETDEV_DPDK_CLASS_COMMON,
.init = netdev_dpdk_vhost_class_init,
.construct = netdev_dpdk_vhost_construct,
.destruct = netdev_dpdk_vhost_destruct,
.send = netdev_dpdk_vhost_send,
.get_carrier = netdev_dpdk_vhost_get_carrier,
.get_stats = netdev_dpdk_vhost_get_stats,
.get_custom_stats = netdev_dpdk_vhost_get_custom_stats,
.get_status = netdev_dpdk_vhost_user_get_status,
.reconfigure = netdev_dpdk_vhost_reconfigure,
.rxq_recv = netdev_dpdk_vhost_rxq_recv,
.rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};

static const struct netdev_class dpdk_vhost_client_class = {
.type = "dpdkvhostuserclient",
NETDEV_DPDK_CLASS_COMMON,
.init = netdev_dpdk_vhost_class_init,
.construct = netdev_dpdk_vhost_client_construct,
.destruct = netdev_dpdk_vhost_destruct,
.get_config = netdev_dpdk_vhost_client_get_config,
.set_config = netdev_dpdk_vhost_client_set_config,
.send = netdev_dpdk_vhost_send,
.get_carrier = netdev_dpdk_vhost_get_carrier,
.get_stats = netdev_dpdk_vhost_get_stats,
.get_custom_stats = netdev_dpdk_vhost_get_custom_stats,
.get_status = netdev_dpdk_vhost_user_get_status,
.reconfigure = netdev_dpdk_vhost_client_reconfigure,
.rxq_recv = netdev_dpdk_vhost_rxq_recv,
.rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};
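
/* Parses DPDK-related other_config settings and registers the 'dpdk',
 * 'dpdkvhostuser' and 'dpdkvhostuserclient' netdev providers. */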
void
netdev_dpdk_register(const struct smap *ovs_other_config)
{
parse_mempool_config(ovs_other_config);
parse_user_mempools_list(ovs_other_config);
parse_vhost_config(ovs_other_config);
netdev_register_provider(&dpdk_class);
netdev_register_provider(&dpdk_vhost_class);
netdev_register_provider(&dpdk_vhost_client_class);
}