diff --git a/INSTALL.DPDK-ADVANCED.md b/INSTALL.DPDK-ADVANCED.md index 11b603aa9..77e202dba 100755 --- a/INSTALL.DPDK-ADVANCED.md +++ b/INSTALL.DPDK-ADVANCED.md @@ -1,5 +1,5 @@ OVS DPDK ADVANCED INSTALL GUIDE -================================= +=============================== ## Contents @@ -13,7 +13,8 @@ OVS DPDK ADVANCED INSTALL GUIDE 8. [Rate Limiting](#rl) 9. [Flow Control](#fc) 10. [Pdump](#pdump) -11. [Vsperf](#vsperf) +11. [Jumbo Frames](#jumbo) +12. [Vsperf](#vsperf) ## 1. Overview @@ -981,7 +982,58 @@ tcpdump. Issue the following command to view the contents of 'pkts.pcap': A performance decrease is expected when using a monitoring application like the DPDK pdump app. -## 11. Vsperf +## 11. Jumbo Frames + +By default, DPDK ports are configured with standard Ethernet MTU (1500B). To +enable Jumbo Frames support for a DPDK port, change the Interface's `mtu_request` +attribute to a sufficiently large value. + +e.g. Add a DPDK Phy port with MTU of 9000: + +`ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk -- set Interface dpdk0 mtu_request=9000` + +e.g. Change the MTU of an existing port to 6200: + +`ovs-vsctl set Interface dpdk0 mtu_request=6200` + +When Jumbo Frames are enabled, the size of a DPDK port's mbuf segments is +increased, such that a full Jumbo Frame of a specific size may be accommodated +within a single mbuf segment. + +Jumbo frame support has been validated against 9728B frames (largest frame size +supported by Fortville NIC), using the DPDK `i40e` driver, but larger frames +(particularly in use cases involving East-West traffic only), and other DPDK NIC +drivers may be supported. + +### 11.1 vHost Ports and Jumbo Frames + +Some additional configuration is needed to take advantage of jumbo frames with +vhost ports: + + 1. 
`mergeable buffers` must be enabled for vHost ports, as demonstrated in + the QEMU command line snippet below: + + ``` + '-netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce \' + '-device virtio-net-pci,mac=00:00:00:00:00:01,netdev=mynet1,mrg_rxbuf=on' + ``` + + 2. Where virtio devices are bound to the Linux kernel driver in a guest + environment (i.e. interfaces are not bound to an in-guest DPDK driver), + the MTU of those logical network interfaces must also be increased to a + sufficiently large value. This avoids segmentation of Jumbo Frames + received in the guest. Note that 'MTU' refers to the length of the IP + packet only, and not that of the entire frame. + + To calculate the exact MTU of a standard IPv4 frame, subtract the L2 + header and CRC lengths (i.e. 18B) from the max supported frame size. + So, to set the MTU for a 9018B Jumbo Frame: + + ``` + ifconfig eth1 mtu 9000 + ``` + +## 12. Vsperf Vsperf project goal is to develop vSwitch test framework that can be used to validate the suitability of different vSwitch implementations in a Telco deployment diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md index c0686cef7..30e925887 100644 --- a/INSTALL.DPDK.md +++ b/INSTALL.DPDK.md @@ -590,7 +590,6 @@ can be found in [Vhost Walkthrough]. ## 6. Limitations - - Supports MTU size 1500, MTU setting for DPDK netdevs will be in future OVS release. - Currently DPDK ports does not use HW offload functionality. - Network Interface Firmware requirements: Each release of DPDK is validated against a specific firmware version for diff --git a/NEWS b/NEWS index 63f147955..2fd395847 100644 --- a/NEWS +++ b/NEWS @@ -70,6 +70,7 @@ Post-v2.5.0 fragmentation or NAT support yet) * Support for DPDK 16.07 * Optional support for DPDK pdump enabled. + * Jumbo frame support - Increase number of registers to 16. - ovs-benchmark: This utility has been removed due to lack of use and bitrot. 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index ca4d50962..c767fd438 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -84,6 +84,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + sizeof(struct dp_packet) \ + RTE_PKTMBUF_HEADROOM) #define NETDEV_DPDK_MBUF_ALIGN 1024 +#define NETDEV_DPDK_MAX_PKT_LEN 9728 /* Max and min number of packets in the mempool. OVS tries to allocate a * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have @@ -373,6 +374,7 @@ struct netdev_dpdk { /* The following properties cannot be changed when a device is running, * so we remember the request and update them next time * netdev_dpdk*_reconfigure() is called */ + int requested_mtu; int requested_n_txq; int requested_n_rxq; @@ -482,10 +484,19 @@ dpdk_mp_get(int socket_id, int mtu) OVS_REQUIRES(dpdk_mutex) dmp->mtu = mtu; dmp->refcount = 1; mbp_priv.mbuf_data_room_size = MBUF_SIZE(mtu) - sizeof(struct dp_packet); - mbp_priv.mbuf_priv_size = sizeof (struct dp_packet) - - sizeof (struct rte_mbuf); + mbp_priv.mbuf_priv_size = sizeof (struct dp_packet) + - sizeof (struct rte_mbuf); + /* XXX: this is a really rough method of provisioning memory. + * It's impossible to determine what the exact memory requirements are when + * the number of ports and rxqs that utilize a particular mempool can change + * dynamically at runtime. For the moment, use this rough heuristic. + */ + if (mtu >= ETHER_MTU) { + mp_size = MAX_NB_MBUF; + } else { + mp_size = MIN_NB_MBUF; + } - mp_size = MAX_NB_MBUF; do { if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d_%u", dmp->mtu, dmp->socket_id, mp_size) < 0) { @@ -525,6 +536,35 @@ dpdk_mp_put(struct dpdk_mp *dmp) OVS_REQUIRES(dpdk_mutex) } } +/* Tries to allocate new mempool on requested_socket_id with + * mbuf size corresponding to requested_mtu. + * On success new configuration will be applied. + * On error, device will be left unchanged. 
*/ +static int +netdev_dpdk_mempool_configure(struct netdev_dpdk *dev) + OVS_REQUIRES(dpdk_mutex) + OVS_REQUIRES(dev->mutex) +{ + uint32_t buf_size = dpdk_buf_size(dev->requested_mtu); + struct dpdk_mp *mp; + + mp = dpdk_mp_get(dev->requested_socket_id, FRAME_LEN_TO_MTU(buf_size)); + if (!mp) { + VLOG_ERR("Insufficient memory to create memory pool for netdev " + "%s, with MTU %d on socket %d\n", + dev->up.name, dev->requested_mtu, dev->requested_socket_id); + return ENOMEM; + } else { + dpdk_mp_put(dev->dpdk_mp); + dev->dpdk_mp = mp; + dev->mtu = dev->requested_mtu; + dev->socket_id = dev->requested_socket_id; + dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu); + } + + return 0; +} + static void check_link_status(struct netdev_dpdk *dev) { @@ -576,7 +616,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) { int diag = 0; int i; + struct rte_eth_conf conf = port_conf; + if (dev->mtu > ETHER_MTU) { + conf.rxmode.jumbo_frame = 1; + conf.rxmode.max_rx_pkt_len = dev->max_packet_len; + } else { + conf.rxmode.jumbo_frame = 0; + conf.rxmode.max_rx_pkt_len = 0; + } /* A device may report more queues than it makes available (this has * been observed for Intel xl710, which reserves some of them for * SRIOV): rte_eth_*_queue_setup will fail if a queue is not @@ -587,8 +635,10 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq); } - diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf); + diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf); if (diag) { + VLOG_WARN("Interface %s eth_dev setup error %s\n", + dev->up.name, rte_strerror(-diag)); break; } @@ -741,7 +791,6 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no, struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); int sid; int err = 0; - uint32_t buf_size; ovs_mutex_init(&dev->mutex); ovs_mutex_lock(&dev->mutex); @@ -762,15 +811,13 @@ netdev_dpdk_init(struct netdev *netdev, 
unsigned int port_no, dev->port_id = port_no; dev->type = type; dev->flags = 0; - dev->mtu = ETHER_MTU; + dev->requested_mtu = dev->mtu = ETHER_MTU; dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu); ovsrcu_index_init(&dev->vid, -1); dev->vhost_reconfigured = false; - buf_size = dpdk_buf_size(dev->mtu); - dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size)); - if (!dev->dpdk_mp) { - err = ENOMEM; + err = netdev_dpdk_mempool_configure(dev); + if (err) { goto unlock; } @@ -1008,6 +1055,7 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq); smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq); smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq); + smap_add_format(args, "mtu", "%d", dev->mtu); ovs_mutex_unlock(&dev->mutex); return 0; @@ -1382,6 +1430,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts; unsigned int total_pkts = cnt; unsigned int qos_pkts = 0; + unsigned int mtu_dropped = 0; int i, retries = 0; qid = dev->tx_q[qid % netdev->n_txq].map; @@ -1403,25 +1452,41 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, do { int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; unsigned int tx_pkts; + unsigned int try_tx_pkts = cnt; + for (i = 0; i < cnt; i++) { + if (cur_pkts[i]->pkt_len > dev->max_packet_len) { + try_tx_pkts = i; + break; + } + } + if (!try_tx_pkts) { + cur_pkts++; + mtu_dropped++; + cnt--; + continue; + } tx_pkts = rte_vhost_enqueue_burst(netdev_dpdk_get_vid(dev), - vhost_qid, cur_pkts, cnt); + vhost_qid, cur_pkts, try_tx_pkts); if (OVS_LIKELY(tx_pkts)) { /* Packets have been sent.*/ cnt -= tx_pkts; /* Prepare for possible retry.*/ cur_pkts = &cur_pkts[tx_pkts]; + if (tx_pkts != try_tx_pkts) { + retries++; + } } else { /* No packets sent - do not retry.*/ break; } - } while (cnt && (retries++ < VHOST_ENQ_RETRY_NUM)); + } while (cnt && 
(retries <= VHOST_ENQ_RETRY_NUM)); rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); rte_spinlock_lock(&dev->stats_lock); - cnt += qos_pkts; - netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt); + netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, + cnt + mtu_dropped + qos_pkts); rte_spinlock_unlock(&dev->stats_lock); out: @@ -1649,6 +1714,27 @@ netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup) return 0; } +static int +netdev_dpdk_set_mtu(struct netdev *netdev, int mtu) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + + if (MTU_TO_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN + || mtu < ETHER_MIN_MTU) { + VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu); + return EINVAL; + } + + ovs_mutex_lock(&dev->mutex); + if (dev->requested_mtu != mtu) { + dev->requested_mtu = mtu; + netdev_request_reconfigure(netdev); + } + ovs_mutex_unlock(&dev->mutex); + + return 0; +} + static int netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier); @@ -2834,7 +2920,8 @@ netdev_dpdk_reconfigure(struct netdev *netdev) ovs_mutex_lock(&dev->mutex); if (netdev->n_txq == dev->requested_n_txq - && netdev->n_rxq == dev->requested_n_rxq) { + && netdev->n_rxq == dev->requested_n_rxq + && dev->mtu == dev->requested_mtu) { /* Reconfiguration is unnecessary */ goto out; @@ -2842,6 +2929,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev) rte_eth_dev_stop(dev->port_id); + if (dev->mtu != dev->requested_mtu) { + netdev_dpdk_mempool_configure(dev); + } + netdev->n_txq = dev->requested_n_txq; netdev->n_rxq = dev->requested_n_rxq; @@ -2849,6 +2940,8 @@ netdev_dpdk_reconfigure(struct netdev *netdev) err = dpdk_eth_dev_init(dev); netdev_dpdk_alloc_txq(dev, netdev->n_txq); + netdev_change_seq_changed(netdev); + out: ovs_mutex_unlock(&dev->mutex); @@ -2861,7 +2954,6 @@ static int netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); - int err = 0; ovs_mutex_lock(&dpdk_mutex); 
ovs_mutex_lock(&dev->mutex); @@ -2876,13 +2968,10 @@ netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev) netdev_dpdk_remap_txqs(dev); - if (dev->requested_socket_id != dev->socket_id) { - dev->socket_id = dev->requested_socket_id; - /* Change mempool to new NUMA Node */ - dpdk_mp_put(dev->dpdk_mp); - dev->dpdk_mp = dpdk_mp_get(dev->socket_id, dev->mtu); - if (!dev->dpdk_mp) { - err = ENOMEM; + if (dev->requested_socket_id != dev->socket_id + || dev->requested_mtu != dev->mtu) { + if (!netdev_dpdk_mempool_configure(dev)) { + netdev_change_seq_changed(netdev); } } @@ -2893,7 +2982,7 @@ netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev) ovs_mutex_unlock(&dev->mutex); ovs_mutex_unlock(&dpdk_mutex); - return err; + return 0; } static int @@ -2907,6 +2996,12 @@ netdev_dpdk_vhost_cuse_reconfigure(struct netdev *netdev) netdev->n_txq = dev->requested_n_txq; netdev->n_rxq = 1; + if (dev->requested_mtu != dev->mtu) { + if (!netdev_dpdk_mempool_configure(dev)) { + netdev_change_seq_changed(netdev); + } + } + ovs_mutex_unlock(&dev->mutex); ovs_mutex_unlock(&dpdk_mutex); @@ -2944,7 +3039,7 @@ netdev_dpdk_vhost_cuse_reconfigure(struct netdev *netdev) netdev_dpdk_set_etheraddr, \ netdev_dpdk_get_etheraddr, \ netdev_dpdk_get_mtu, \ - NULL, /* set_mtu */ \ + netdev_dpdk_set_mtu, \ netdev_dpdk_get_ifindex, \ GET_CARRIER, \ netdev_dpdk_get_carrier_resets, \