
dpif-netdev: XPS (Transmit Packet Steering) implementation.

If the number of CPUs in pmd-cpu-mask is not divisible by the number of
queues, and in a few more complex situations, TX queue-ids may be
distributed unfairly between PMD threads.

For example, if we have 2 ports with 4 queues each and 6 CPUs in
pmd-cpu-mask, the following distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above, the dpdk0 port is polled by threads on cores
12, 13, 16 and 17.

By design of dpif-netdev, only one TX queue-id is assigned to each PMD
thread. These queue-ids are sequential, similar to core-ids, and a thread
sends packets to the queue with exactly this queue-id regardless of
the port.

In the previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for the dpdk0 port, after truncation in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result, only 2 of the 4 queues are used.
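
To make the truncation concrete, here is a minimal standalone sketch
(plain C, not the OVS code) that reproduces the mapping above:
<------------------------------------------------------------------------>
#include <stdio.h>

int
main(void)
{
    /* Cores polling dpdk0 and the sequential TX queue-ids of their
     * PMD threads (queue-ids follow core-ids by design). */
    int cores[] = { 12, 13, 16, 17 };
    int qids[]  = {  0,  1,  4,  5 };
    int n_txq = 4;                  /* dpdk0 has only 4 TX queues. */

    for (int i = 0; i < 4; i++) {
        /* netdev-dpdk truncates the static queue-id with a modulo. */
        printf("core %d --> TX queue-id %d %% %d == %d\n",
               cores[i], qids[i], n_txq, qids[i] % n_txq);
    }
    return 0;
}
<------------------------------------------------------------------------>

Only queue-ids 0 and 1 ever appear, so queues 2 and 3 of dpdk0 stay idle.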

To fix this issue, a form of XPS is implemented in the following way
(a simplified sketch of the idea follows the list):

	* TX queue-ids are allocated dynamically.
	* When a PMD thread tries to send packets to a new port for the
	  first time, it allocates the least used TX queue for this port.
	* PMD threads periodically revalidate the allocated TX queue-ids.
	  If a queue wasn't used in the last XPS_TIMEOUT_MS milliseconds,
	  it is freed during revalidation.
	* XPS is not used if there are enough TX queues.
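
A minimal sketch of that idea, with hypothetical names and simplified
structures (the real implementation lives in dpif-netdev and handles
locking and per-port state differently):
<------------------------------------------------------------------------>
#include <stdint.h>

#define XPS_TIMEOUT_MS 500      /* assumed idle timeout for this sketch */

/* Hypothetical, simplified state: a per-port array counting how many
 * threads currently use each TX queue, and a cached queue-id with a
 * last-used timestamp per (PMD thread, port) pair. */
struct xps_port {
    int n_txq;                  /* Number of TX queues of the port. */
    unsigned *txq_used;         /* Users of each TX queue. */
};

struct xps_cached_qid {
    int qid;                    /* Allocated queue-id, -1 if none. */
    int64_t last_used_ms;       /* Time of last transmission. */
};

/* Returns a TX queue-id for this thread/port pair, allocating the
 * least used queue on first use. */
int
xps_get_tx_qid(struct xps_port *port, struct xps_cached_qid *cached,
               int64_t now_ms)
{
    if (cached->qid < 0) {
        int min_qid = 0;

        for (int i = 1; i < port->n_txq; i++) {
            if (port->txq_used[i] < port->txq_used[min_qid]) {
                min_qid = i;
            }
        }
        port->txq_used[min_qid]++;
        cached->qid = min_qid;
    }
    cached->last_used_ms = now_ms;
    return cached->qid;
}

/* Periodic revalidation: frees the mapping if it was idle for longer
 * than XPS_TIMEOUT_MS. */
void
xps_revalidate(struct xps_port *port, struct xps_cached_qid *cached,
               int64_t now_ms)
{
    if (cached->qid >= 0 && now_ms - cached->last_used_ms > XPS_TIMEOUT_MS) {
        port->txq_used[cached->qid]--;
        cached->qid = -1;
    }
}
<------------------------------------------------------------------------>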

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Ilya Maximets, 2016-07-27 17:44:41 +03:00
committed by Daniele Di Proietto
parent 7adc92ac59
commit 324c837485
8 changed files with 200 additions and 74 deletions


@@ -298,7 +298,7 @@ struct dpdk_tx_queue {
     rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                     * from concurrent access.  It is used only
                                     * if the queue is shared among different
-                                    * pmd threads (see 'txq_needs_locking'). */
+                                    * pmd threads (see 'concurrent_txq'). */
     int map;                       /* Mapping of configured vhost-user queues
                                     * to enabled by guest. */
 };
@@ -349,13 +349,6 @@ struct netdev_dpdk {
     struct rte_eth_link link;
     int link_reset_cnt;
 
-    /* Caller of netdev_send() might want to use more txqs than the device has.
-     * For physical NICs, if the 'requested_n_txq' less or equal to 'up.n_txq',
-     * 'txq_needs_locking' is false, otherwise it is true and we will take a
-     * spinlock on transmission. For vhost devices, 'requested_n_txq' is
-     * always true. */
-    bool txq_needs_locking;
-
     /* virtio-net structure for vhost device */
     OVSRCU_TYPE(struct virtio_net *) virtio_dev;
@@ -778,10 +771,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
             goto unlock;
         }
         netdev_dpdk_alloc_txq(dev, netdev->n_txq);
-        dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
     } else {
         netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
-        dev->txq_needs_locking = true;
         /* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
         dev->flags = NETDEV_UP | NETDEV_PROMISC;
     }
@@ -1468,7 +1459,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
 static int
 netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                        struct dp_packet_batch *batch,
-                       bool may_steal)
+                       bool may_steal, bool concurrent_txq OVS_UNUSED)
 {
     if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
@@ -1484,9 +1475,10 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
 static inline void
 netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
-                   struct dp_packet_batch *batch, bool may_steal)
+                   struct dp_packet_batch *batch, bool may_steal,
+                   bool concurrent_txq)
 {
-    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+    if (OVS_UNLIKELY(concurrent_txq)) {
         qid = qid % dev->up.n_txq;
         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
     }
@@ -1551,18 +1543,19 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
         }
     }
 
-    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+    if (OVS_UNLIKELY(concurrent_txq)) {
         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
     }
 }
 
 static int
 netdev_dpdk_eth_send(struct netdev *netdev, int qid,
-                     struct dp_packet_batch *batch, bool may_steal)
+                     struct dp_packet_batch *batch, bool may_steal,
+                     bool concurrent_txq)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
-    netdev_dpdk_send__(dev, qid, batch, may_steal);
+    netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
     return 0;
 }
@@ -2533,7 +2526,8 @@ dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id)
 static int
 netdev_dpdk_ring_send(struct netdev *netdev, int qid,
-                      struct dp_packet_batch *batch, bool may_steal)
+                      struct dp_packet_batch *batch, bool may_steal,
+                      bool concurrent_txq)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     unsigned i;
@@ -2546,7 +2540,7 @@ netdev_dpdk_ring_send(struct netdev *netdev, int qid,
         dp_packet_rss_invalidate(batch->packets[i]);
     }
 
-    netdev_dpdk_send__(dev, qid, batch, may_steal);
+    netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
     return 0;
 }
@@ -2823,8 +2817,6 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
     err = dpdk_eth_dev_init(dev);
     netdev_dpdk_alloc_txq(dev, netdev->n_txq);
 
-    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
-
 out:
     ovs_mutex_unlock(&dev->mutex);