2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-31 14:25:26 +00:00

datapath: Report kernel's flow key when passing packets up to userspace.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independently of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.

This commit takes one step in that direction by making the kernel report
its idea of the flow that a packet belongs to whenever it passes a packet
up to userspace.  This means that userspace can intelligently figure out
what to do:

   - If userspace's notion of the flow for the packet matches the kernel's,
     then nothing special is necessary.

   - If the kernel has a more specific notion for the flow than userspace,
     for example if the kernel decoded IPv6 headers but userspace stopped
     at the Ethernet type (because it does not understand IPv6), then again
     nothing special is necessary: userspace can still set up the flow in
     the usual way.

   - If userspace has a more specific notion for the flow than the kernel,
     for example if userspace decoded an IPv6 header but the kernel
     stopped at the Ethernet type, then userspace can forward the packet
     manually, without setting up a flow in the kernel.  (This case is
     bad from a performance point of view, but at least it is correct.)

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, although userspace does now
have enough information to do that intelligently.  This will have to wait
for later commits.

This commit is bigger than it would otherwise be because it is rolled
together with changing "struct odp_msg" to a sequence of Netlink
attributes.  The alternative, to do each of those changes in a separate
patch, seemed like overkill because it meant that either we would have to
introduce and then kill off Netlink attributes for in_port and tun_id, if
Netlink conversion went first, or shove yet another variable-length header
into the stuff already after odp_msg, if adding the flow key to odp_msg
went first.

This commit will slow down performance of checksumming packets sent up to
userspace.  I'm not entirely pleased with how I did it.  I considered a
couple of alternatives, but none of them seemed that much better.
Suggestions welcome.  Not changing anything wasn't an option,
unfortunately.  At any rate some slowdown will become unavoidable when OVS
actually starts using Netlink instead of just Netlink framing.

(Actually, I thought of one option where we could avoid that: make
userspace do the checksum instead, by passing csum_start and csum_offset as
part of what goes to userspace.  But that's not perfect either.)

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
This commit is contained in:
Ben Pfaff
2011-01-24 14:59:57 -08:00
parent 36956a7d33
commit 856081f683
14 changed files with 470 additions and 488 deletions

View File

@@ -55,8 +55,6 @@
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
/* Configuration parameters. */
enum { N_QUEUES = 2 }; /* Number of queues for dpif_recv(). */
enum { MAX_QUEUE_LEN = 100 }; /* Maximum number of packets per queue. */
enum { MAX_PORTS = 256 }; /* Maximum number of ports. */
enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
@@ -64,6 +62,17 @@ enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
* headers to be aligned on a 4-byte boundary. */
enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
/* Queues. */
enum { N_QUEUES = 2 }; /* Number of queues for dpif_recv(). */
enum { MAX_QUEUE_LEN = 128 }; /* Maximum number of packets per queue. */
enum { QUEUE_MASK = MAX_QUEUE_LEN - 1 };
BUILD_ASSERT_DECL(IS_POW2(MAX_QUEUE_LEN));
struct dp_netdev_queue {
struct dpif_upcall *upcalls[MAX_QUEUE_LEN];
unsigned int head, tail;
};
/* Datapath based on the network device interface from netdev.h. */
struct dp_netdev {
const struct dpif_class *class;
@@ -72,8 +81,7 @@ struct dp_netdev {
bool destroyed;
bool drop_frags; /* Drop all IP fragments, if true. */
struct list queues[N_QUEUES]; /* Contain ofpbufs queued for dpif_recv(). */
size_t queue_len[N_QUEUES]; /* Number of packets in each queue. */
struct dp_netdev_queue queues[N_QUEUES];
struct hmap flow_table; /* Flow table. */
/* Statistics. */
@@ -139,7 +147,8 @@ static int do_del_port(struct dp_netdev *, uint16_t port_no);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
bool create, struct dpif **);
static int dp_netdev_output_control(struct dp_netdev *, const struct ofpbuf *,
int queue_no, int port_no, uint64_t arg);
int queue_no, const struct flow *,
uint64_t arg);
static int dp_netdev_execute_actions(struct dp_netdev *,
struct ofpbuf *, struct flow *,
const struct nlattr *actions,
@@ -191,7 +200,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
dp->open_cnt = 0;
dp->drop_frags = false;
for (i = 0; i < N_QUEUES; i++) {
list_init(&dp->queues[i]);
dp->queues[i].head = dp->queues[i].tail = 0;
}
hmap_init(&dp->flow_table);
list_init(&dp->port_list);
@@ -248,7 +257,15 @@ dp_netdev_free(struct dp_netdev *dp)
do_del_port(dp, port->port_no);
}
for (i = 0; i < N_QUEUES; i++) {
ofpbuf_list_delete(&dp->queues[i]);
struct dp_netdev_queue *q = &dp->queues[i];
unsigned int j;
for (j = q->tail; j != q->head; j++) {
struct dpif_upcall *upcall = q->upcalls[j & QUEUE_MASK];
ofpbuf_delete(upcall->packet);
free(upcall);
}
}
hmap_destroy(&dp->flow_table);
free(dp->name);
@@ -931,7 +948,7 @@ dpif_netdev_recv_set_mask(struct dpif *dpif, int listen_mask)
}
}
static int
static struct dp_netdev_queue *
find_nonempty_queue(struct dpif *dpif)
{
struct dpif_netdev *dpif_netdev = dpif_netdev_cast(dpif);
@@ -940,23 +957,22 @@ find_nonempty_queue(struct dpif *dpif)
int i;
for (i = 0; i < N_QUEUES; i++) {
struct list *queue = &dp->queues[i];
if (!list_is_empty(queue) && mask & (1u << i)) {
return i;
struct dp_netdev_queue *q = &dp->queues[i];
if (q->head != q->tail && mask & (1u << i)) {
return q;
}
}
return -1;
return NULL;
}
static int
dpif_netdev_recv(struct dpif *dpif, struct ofpbuf **bufp)
dpif_netdev_recv(struct dpif *dpif, struct dpif_upcall *upcall)
{
int queue_idx = find_nonempty_queue(dpif);
if (queue_idx >= 0) {
struct dp_netdev *dp = get_dp_netdev(dpif);
*bufp = ofpbuf_from_list(list_pop_front(&dp->queues[queue_idx]));
dp->queue_len[queue_idx]--;
struct dp_netdev_queue *q = find_nonempty_queue(dpif);
if (q) {
struct dpif_upcall *u = q->upcalls[q->tail++ & QUEUE_MASK];
*upcall = *u;
free(u);
return 0;
} else {
@@ -967,7 +983,7 @@ dpif_netdev_recv(struct dpif *dpif, struct ofpbuf **bufp)
static void
dpif_netdev_recv_wait(struct dpif *dpif)
{
if (find_nonempty_queue(dpif) >= 0) {
if (find_nonempty_queue(dpif)) {
poll_immediate_wake();
} else {
/* No messages ready to be received, and dp_wait() will ensure that we
@@ -1011,7 +1027,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port,
dp->n_hit++;
} else {
dp->n_missed++;
dp_netdev_output_control(dp, packet, _ODPL_MISS_NR, port->port_no, 0);
dp_netdev_output_control(dp, packet, _ODPL_MISS_NR, &key, 0);
}
}
@@ -1212,27 +1228,33 @@ dp_netdev_output_port(struct dp_netdev *dp, struct ofpbuf *packet,
static int
dp_netdev_output_control(struct dp_netdev *dp, const struct ofpbuf *packet,
int queue_no, int port_no, uint64_t arg)
int queue_no, const struct flow *flow, uint64_t arg)
{
struct odp_msg *header;
struct ofpbuf *msg;
size_t msg_size;
struct dp_netdev_queue *q = &dp->queues[queue_no];
struct dpif_upcall *upcall;
struct ofpbuf *buf;
size_t key_len;
if (dp->queue_len[queue_no] >= MAX_QUEUE_LEN) {
if (q->head - q->tail >= MAX_QUEUE_LEN) {
dp->n_lost++;
return ENOBUFS;
}
msg_size = sizeof *header + packet->size;
msg = ofpbuf_new_with_headroom(msg_size, DPIF_RECV_MSG_PADDING);
header = ofpbuf_put_uninit(msg, sizeof *header);
header->type = queue_no;
header->length = msg_size;
header->port = port_no;
header->arg = arg;
ofpbuf_put(msg, packet->data, packet->size);
list_push_back(&dp->queues[queue_no], &msg->list_node);
dp->queue_len[queue_no]++;
buf = ofpbuf_new(ODPUTIL_FLOW_KEY_BYTES + 2 + packet->size);
odp_flow_key_from_flow(buf, flow);
key_len = buf->size;
ofpbuf_pull(buf, key_len);
ofpbuf_reserve(buf, 2);
ofpbuf_put(buf, packet->data, packet->size);
upcall = xzalloc(sizeof *upcall);
upcall->type = queue_no;
upcall->packet = buf;
upcall->key = buf->base;
upcall->key_len = key_len;
upcall->userdata = arg;
q->upcalls[++q->head & QUEUE_MASK] = upcall;
return 0;
}
@@ -1282,7 +1304,7 @@ dp_netdev_execute_actions(struct dp_netdev *dp,
case ODPAT_CONTROLLER:
dp_netdev_output_control(dp, packet, _ODPL_ACTION_NR,
key->in_port, nl_attr_get_u64(a));
key, nl_attr_get_u64(a));
break;
case ODPAT_SET_DL_TCI: