2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 09:58:01 +00:00

netdev-afxdp: Add need_wakeup support.

The patch adds support for using need_wakeup flag in AF_XDP rings.
A new option, use-need-wakeup, is added.  When this option is used,
it means that OVS has to explicitly wake up the kernel RX, using poll()
syscall and wake up TX, using sendto() syscall. This feature improves
the performance by avoiding unnecessary sendto syscalls for TX.
For RX, instead of the kernel always busy-spinning on the fill queue, OVS wakes
up the kernel RX processing when the fill queue is replenished.

The need_wakeup feature is merged into the Linux kernel bpf-next tree with commit
77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP rings") and
OVS enables it by default, if libbpf supports it.  If users enable it but
run with an older version of libbpf, then the need_wakeup feature has no effect,
and a warning message is logged.

For virtual interfaces, it's better to set use-need-wakeup=false, since
the virtual device's AF_XDP xmit is synchronous: the sendto syscall
enters the kernel and processes the TX packets on the tx queue directly.

On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
to physical port improves from 6.1Mpps to 7.3Mpps.

Suggested-by: Ilya Maximets <i.maximets@ovn.org>
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
This commit is contained in:
William Tu 2019-10-23 14:06:01 -07:00 committed by Ilya Maximets
parent ed1617406c
commit e50547b51a
6 changed files with 135 additions and 16 deletions

View File

@ -176,9 +176,17 @@ in :doc:`general`::
ovs-vswitchd ... ovs-vswitchd ...
ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev
Make sure your device driver support AF_XDP, and to use 1 PMD (on core 4) Make sure your device driver support AF_XDP, netdev-afxdp supports
on 1 queue (queue 0) device, configure these options: **pmd-cpu-mask, the following additional options (see man ovs-vswitchd.conf.db for
pmd-rxq-affinity, and n_rxq**. The **xdpmode** can be "drv" or "skb":: more details):
* **xdpmode**: use "drv" for driver mode, or "skb" for skb mode.
* **use-need-wakeup**: default "true" if libbpf supports it, otherwise false.
For example, to use 1 PMD (on core 4) on 1 queue (queue 0) device,
configure these options: **pmd-cpu-mask, pmd-rxq-affinity, and n_rxq**.
The **xdpmode** can be "drv" or "skb"::
ethtool -L enp2s0 combined 1 ethtool -L enp2s0 combined 1
ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x10 ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x10

3
NEWS
View File

@ -5,6 +5,9 @@ Post-v2.12.0
separate project. You can find it at separate project. You can find it at
https://github.com/ovn-org/ovn.git https://github.com/ovn-org/ovn.git
- Userspace datapath: - Userspace datapath:
* New option 'use-need-wakeup' for netdev-afxdp to control enabling
of corresponding 'need_wakeup' flag in AF_XDP rings. Enabled by default
if supported by libbpf.
* Add option to enable, disable and query TCP sequence checking in * Add option to enable, disable and query TCP sequence checking in
conntrack. conntrack.

View File

@ -276,6 +276,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
[Define to 1 if AF_XDP support is available and enabled.]) [Define to 1 if AF_XDP support is available and enabled.])
LIBBPF_LDADD=" -lbpf -lelf" LIBBPF_LDADD=" -lbpf -lelf"
AC_SUBST([LIBBPF_LDADD]) AC_SUBST([LIBBPF_LDADD])
AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
[XDP need wakeup support detected in xsk.h.])
], [], [[#include <bpf/xsk.h>]])
fi fi
AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true) AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
]) ])

View File

@ -26,6 +26,7 @@
#include <linux/rtnetlink.h> #include <linux/rtnetlink.h>
#include <linux/if_xdp.h> #include <linux/if_xdp.h>
#include <net/if.h> #include <net/if.h>
#include <poll.h>
#include <stdlib.h> #include <stdlib.h>
#include <sys/resource.h> #include <sys/resource.h>
#include <sys/socket.h> #include <sys/socket.h>
@ -67,6 +68,12 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
#define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS #define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS
#define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS #define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
/* Default value of the 'use-need-wakeup' option: true only when the build
 * detected need_wakeup support in libbpf (HAVE_XDP_NEED_WAKEUP), false
 * otherwise. */
#ifdef HAVE_XDP_NEED_WAKEUP
#define NEED_WAKEUP_DEFAULT true
#else
#define NEED_WAKEUP_DEFAULT false
#endif
/* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets /* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets
* still on processing in threads. Number of packets currently in OVS * still on processing in threads. Number of packets currently in OVS
* processing is hard to estimate because it depends on number of ports. * processing is hard to estimate because it depends on number of ports.
@ -82,7 +89,7 @@ BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
#define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char *)base)) #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char *)base))
static struct xsk_socket_info *xsk_configure(int ifindex, int xdp_queue_id, static struct xsk_socket_info *xsk_configure(int ifindex, int xdp_queue_id,
int mode); int mode, bool use_need_wakeup);
static void xsk_remove_xdp_program(uint32_t ifindex, int xdpmode); static void xsk_remove_xdp_program(uint32_t ifindex, int xdpmode);
static void xsk_destroy(struct xsk_socket_info *xsk); static void xsk_destroy(struct xsk_socket_info *xsk);
static int xsk_configure_all(struct netdev *netdev); static int xsk_configure_all(struct netdev *netdev);
@ -117,6 +124,54 @@ struct xsk_socket_info {
atomic_uint64_t tx_dropped; atomic_uint64_t tx_dropped;
}; };
#ifdef HAVE_XDP_NEED_WAKEUP
/* Wakes up kernel RX processing for an AF_XDP socket, if required.
 *
 * Does nothing unless 'use_need_wakeup' is set on 'netdev'.  Otherwise, when
 * the fill ring of 'umem' reports that it needs a wakeup, issues a poll() on
 * 'fd' with a zero timeout (so it never blocks) and logs a rate-limited
 * warning if poll() fails. */
static inline void
xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
                        struct netdev *netdev, int fd)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);

    if (dev->use_need_wakeup && xsk_ring_prod__needs_wakeup(&umem->fq)) {
        struct pollfd rx_pfd;

        rx_pfd.fd = fd;
        rx_pfd.events = POLLIN;

        /* Only the failure case matters here; the number of ready
         * descriptors returned by poll() is not used. */
        if (OVS_UNLIKELY(poll(&rx_pfd, 1, 0) < 0)) {
            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
                         netdev_get_name(netdev),
                         ovs_strerror(errno));
        }
    }
}
/* Returns the result of libbpf's xsk_ring_prod__needs_wakeup() for the TX
 * ring of 'xsk_info', i.e. whether the TX ring currently reports that a
 * wakeup is needed. */
static inline bool
xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
{
return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
}
#else /* !HAVE_XDP_NEED_WAKEUP */
/* Stub used when need_wakeup support is not available at build time
 * (HAVE_XDP_NEED_WAKEUP undefined): no RX wakeup is ever issued, and all
 * arguments are ignored. */
static inline void
xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem OVS_UNUSED,
struct netdev *netdev OVS_UNUSED,
int fd OVS_UNUSED)
{
/* Nothing. */
}
/* Stub used when need_wakeup support is not available at build time
 * (HAVE_XDP_NEED_WAKEUP undefined): always reports that a TX wakeup is
 * needed, so a caller can never skip kicking TX on the basis of this
 * check. */
static inline bool
xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
{
return true;
}
#endif /* HAVE_XDP_NEED_WAKEUP */
static void static void
netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool) netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
{ {
@ -235,7 +290,7 @@ xsk_configure_umem(void *buffer, uint64_t size, int xdpmode)
static struct xsk_socket_info * static struct xsk_socket_info *
xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex, xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
uint32_t queue_id, int xdpmode) uint32_t queue_id, int xdpmode, bool use_need_wakeup)
{ {
struct xsk_socket_config cfg; struct xsk_socket_config cfg;
struct xsk_socket_info *xsk; struct xsk_socket_info *xsk;
@ -258,6 +313,12 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE; cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE;
} }
#ifdef HAVE_XDP_NEED_WAKEUP
if (use_need_wakeup) {
cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
}
#endif
if (if_indextoname(ifindex, devname) == NULL) { if (if_indextoname(ifindex, devname) == NULL) {
VLOG_ERR("ifindex %d to devname failed (%s)", VLOG_ERR("ifindex %d to devname failed (%s)",
ifindex, ovs_strerror(errno)); ifindex, ovs_strerror(errno));
@ -268,9 +329,11 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
ret = xsk_socket__create(&xsk->xsk, devname, queue_id, umem->umem, ret = xsk_socket__create(&xsk->xsk, devname, queue_id, umem->umem,
&xsk->rx, &xsk->tx, &cfg); &xsk->rx, &xsk->tx, &cfg);
if (ret) { if (ret) {
VLOG_ERR("xsk_socket__create failed (%s) mode: %s qid: %d", VLOG_ERR("xsk_socket__create failed (%s) mode: %s "
"use-need-wakeup: %s qid: %d",
ovs_strerror(errno), ovs_strerror(errno),
xdpmode == XDP_COPY ? "SKB": "DRV", xdpmode == XDP_COPY ? "SKB": "DRV",
use_need_wakeup ? "true" : "false",
queue_id); queue_id);
free(xsk); free(xsk);
return NULL; return NULL;
@ -312,7 +375,8 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
} }
static struct xsk_socket_info * static struct xsk_socket_info *
xsk_configure(int ifindex, int xdp_queue_id, int xdpmode) xsk_configure(int ifindex, int xdp_queue_id, int xdpmode,
bool use_need_wakeup)
{ {
struct xsk_socket_info *xsk; struct xsk_socket_info *xsk;
struct xsk_umem_info *umem; struct xsk_umem_info *umem;
@ -335,7 +399,8 @@ xsk_configure(int ifindex, int xdp_queue_id, int xdpmode)
VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem); VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode); xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode,
use_need_wakeup);
if (!xsk) { if (!xsk) {
/* Clean up umem and xpacket pool. */ /* Clean up umem and xpacket pool. */
if (xsk_umem__delete(umem->umem)) { if (xsk_umem__delete(umem->umem)) {
@ -366,9 +431,12 @@ xsk_configure_all(struct netdev *netdev)
/* Configure each queue. */ /* Configure each queue. */
for (i = 0; i < n_rxq; i++) { for (i = 0; i < n_rxq; i++) {
VLOG_INFO("%s: configure queue %d mode %s", __func__, i, VLOG_DBG("%s: configure queue %d mode %s use-need-wakeup %s.",
dev->xdpmode == XDP_COPY ? "SKB" : "DRV"); netdev_get_name(netdev), i,
xsk_info = xsk_configure(ifindex, i, dev->xdpmode); dev->xdpmode == XDP_COPY ? "SKB" : "DRV",
dev->use_need_wakeup ? "true" : "false");
xsk_info = xsk_configure(ifindex, i, dev->xdpmode,
dev->use_need_wakeup);
if (!xsk_info) { if (!xsk_info) {
VLOG_ERR("Failed to create AF_XDP socket on queue %d.", i); VLOG_ERR("Failed to create AF_XDP socket on queue %d.", i);
dev->xsks[i] = NULL; dev->xsks[i] = NULL;
@ -460,6 +528,7 @@ netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
struct netdev_linux *dev = netdev_linux_cast(netdev); struct netdev_linux *dev = netdev_linux_cast(netdev);
const char *str_xdpmode; const char *str_xdpmode;
int xdpmode, new_n_rxq; int xdpmode, new_n_rxq;
bool need_wakeup;
ovs_mutex_lock(&dev->mutex); ovs_mutex_lock(&dev->mutex);
new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1); new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
@ -482,10 +551,20 @@ netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
return EINVAL; return EINVAL;
} }
need_wakeup = smap_get_bool(args, "use-need-wakeup", NEED_WAKEUP_DEFAULT);
#ifndef HAVE_XDP_NEED_WAKEUP
if (need_wakeup) {
VLOG_WARN("XDP need_wakeup is not supported in libbpf.");
need_wakeup = false;
}
#endif
if (dev->requested_n_rxq != new_n_rxq if (dev->requested_n_rxq != new_n_rxq
|| dev->requested_xdpmode != xdpmode) { || dev->requested_xdpmode != xdpmode
|| dev->requested_need_wakeup != need_wakeup) {
dev->requested_n_rxq = new_n_rxq; dev->requested_n_rxq = new_n_rxq;
dev->requested_xdpmode = xdpmode; dev->requested_xdpmode = xdpmode;
dev->requested_need_wakeup = need_wakeup;
netdev_request_reconfigure(netdev); netdev_request_reconfigure(netdev);
} }
ovs_mutex_unlock(&dev->mutex); ovs_mutex_unlock(&dev->mutex);
@ -500,7 +579,9 @@ netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args)
ovs_mutex_lock(&dev->mutex); ovs_mutex_lock(&dev->mutex);
smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
smap_add_format(args, "xdpmode", "%s", smap_add_format(args, "xdpmode", "%s",
dev->xdpmode == XDP_ZEROCOPY ? "drv" : "skb"); dev->xdpmode == XDP_ZEROCOPY ? "drv" : "skb");
smap_add_format(args, "use-need-wakeup", "%s",
dev->use_need_wakeup ? "true" : "false");
ovs_mutex_unlock(&dev->mutex); ovs_mutex_unlock(&dev->mutex);
return 0; return 0;
} }
@ -516,6 +597,7 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
if (netdev->n_rxq == dev->requested_n_rxq if (netdev->n_rxq == dev->requested_n_rxq
&& dev->xdpmode == dev->requested_xdpmode && dev->xdpmode == dev->requested_xdpmode
&& dev->use_need_wakeup == dev->requested_need_wakeup
&& dev->xsks) { && dev->xsks) {
goto out; goto out;
} }
@ -532,6 +614,7 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
if (setrlimit(RLIMIT_MEMLOCK, &r)) { if (setrlimit(RLIMIT_MEMLOCK, &r)) {
VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s", ovs_strerror(errno)); VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s", ovs_strerror(errno));
} }
dev->use_need_wakeup = dev->requested_need_wakeup;
err = xsk_configure_all(netdev); err = xsk_configure_all(netdev);
if (err) { if (err) {
@ -654,6 +737,7 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx); rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
if (!rcvd) { if (!rcvd) {
xsk_rx_wakeup_if_needed(umem, netdev, rx->fd);
return EAGAIN; return EAGAIN;
} }
@ -698,11 +782,15 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
} }
static inline int static inline int
kick_tx(struct xsk_socket_info *xsk_info, int xdpmode) kick_tx(struct xsk_socket_info *xsk_info, int xdpmode, bool use_need_wakeup)
{ {
int ret, retries; int ret, retries;
static const int KERNEL_TX_BATCH_SIZE = 16; static const int KERNEL_TX_BATCH_SIZE = 16;
if (use_need_wakeup && !xsk_tx_need_wakeup(xsk_info)) {
return 0;
}
/* In SKB_MODE packet transmission is synchronous, and the kernel xmits /* In SKB_MODE packet transmission is synchronous, and the kernel xmits
* only TX_BATCH_SIZE(16) packets for a single sendmsg syscall. * only TX_BATCH_SIZE(16) packets for a single sendmsg syscall.
* So, we have to kick the kernel (n_packets / 16) times to be sure that * So, we have to kick the kernel (n_packets / 16) times to be sure that
@ -874,7 +962,7 @@ __netdev_afxdp_batch_send(struct netdev *netdev, int qid,
&orig); &orig);
COVERAGE_INC(afxdp_tx_full); COVERAGE_INC(afxdp_tx_full);
afxdp_complete_tx(xsk_info); afxdp_complete_tx(xsk_info);
kick_tx(xsk_info, dev->xdpmode); kick_tx(xsk_info, dev->xdpmode, dev->use_need_wakeup);
error = ENOMEM; error = ENOMEM;
goto out; goto out;
} }
@ -898,7 +986,7 @@ __netdev_afxdp_batch_send(struct netdev *netdev, int qid,
xsk_ring_prod__submit(&xsk_info->tx, dp_packet_batch_size(batch)); xsk_ring_prod__submit(&xsk_info->tx, dp_packet_batch_size(batch));
xsk_info->outstanding_tx += dp_packet_batch_size(batch); xsk_info->outstanding_tx += dp_packet_batch_size(batch);
ret = kick_tx(xsk_info, dev->xdpmode); ret = kick_tx(xsk_info, dev->xdpmode, dev->use_need_wakeup);
if (OVS_UNLIKELY(ret)) { if (OVS_UNLIKELY(ret)) {
VLOG_WARN_RL(&rl, "%s: error sending AF_XDP packet: %s.", VLOG_WARN_RL(&rl, "%s: error sending AF_XDP packet: %s.",
netdev_get_name(netdev), ovs_strerror(ret)); netdev_get_name(netdev), ovs_strerror(ret));
@ -968,6 +1056,7 @@ netdev_afxdp_construct(struct netdev *netdev)
dev->requested_n_rxq = NR_QUEUE; dev->requested_n_rxq = NR_QUEUE;
dev->requested_xdpmode = XDP_COPY; dev->requested_xdpmode = XDP_COPY;
dev->requested_need_wakeup = NEED_WAKEUP_DEFAULT;
dev->xsks = NULL; dev->xsks = NULL;
dev->tx_locks = NULL; dev->tx_locks = NULL;

View File

@ -102,6 +102,8 @@ struct netdev_linux {
int requested_n_rxq; int requested_n_rxq;
int xdpmode; /* AF_XDP running mode: driver or skb. */ int xdpmode; /* AF_XDP running mode: driver or skb. */
int requested_xdpmode; int requested_xdpmode;
bool use_need_wakeup;
bool requested_need_wakeup;
struct ovs_spin *tx_locks; /* spin lock array for TX queues. */ struct ovs_spin *tx_locks; /* spin lock array for TX queues. */
#endif #endif
}; };

View File

@ -3122,6 +3122,18 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \
</p> </p>
</column> </column>
<column name="options" key="use-need-wakeup"
type='{"type": "boolean"}'>
<p>
Specifies whether to use need_wakeup feature in afxdp netdev.
If enabled, OVS explicitly wakes up the kernel RX, using poll()
syscall and wakes up TX, using sendto() syscall. For physical
devices, this feature improves the performance by avoiding
unnecessary sendto syscalls.
Defaults to true if supported by libbpf.
</p>
</column>
<column name="options" key="vhost-server-path" <column name="options" key="vhost-server-path"
type='{"type": "string"}'> type='{"type": "string"}'>
<p> <p>