2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-31 14:25:26 +00:00

datapath: Report kernel's flow key when passing packets up to userspace.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independently of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.

This commit takes one step in that direction by making the kernel report
its idea of the flow that a packet belongs to whenever it passes a packet
up to userspace.  This means that userspace can intelligently figure out
what to do:

   - If userspace's notion of the flow for the packet matches the kernel's,
     then nothing special is necessary.

   - If the kernel has a more specific notion for the flow than userspace,
     for example if the kernel decoded IPv6 headers but userspace stopped
     at the Ethernet type (because it does not understand IPv6), then again
     nothing special is necessary: userspace can still set up the flow in
     the usual way.

   - If userspace has a more specific notion for the flow than the kernel,
     for example if userspace decoded an IPv6 header but the kernel
     stopped at the Ethernet type, then userspace can forward the packet
     manually, without setting up a flow in the kernel.  (This case is
     bad from a performance point of view, but at least it is correct.)

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, although userspace does now
have enough information to do that intelligently.  This will have to wait
for later commits.

This commit is bigger than it would otherwise be because it is rolled
together with changing "struct odp_msg" to a sequence of Netlink
attributes.  The alternative, to do each of those changes in a separate
patch, seemed like overkill because it meant that either we would have to
introduce and then kill off Netlink attributes for in_port and tun_id, if
Netlink conversion went first, or shove yet another variable-length header
into the stuff already after odp_msg, if adding the flow key to odp_msg
went first.

This commit will slow down performance of checksumming packets sent up to
userspace.  I'm not entirely pleased with how I did it.  I considered a
couple of alternatives, but none of them seemed that much better.
Suggestions welcome.  Not changing anything wasn't an option,
unfortunately.  At any rate some slowdown will become unavoidable when OVS
actually starts using Netlink instead of just Netlink framing.

(Actually, I thought of one option where we could avoid that: make
userspace do the checksum instead, by passing csum_start and csum_offset as
part of what goes to userspace.  But that's not perfect either.)

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
This commit is contained in:
Ben Pfaff
2011-01-24 14:59:57 -08:00
parent 36956a7d33
commit 856081f683
14 changed files with 470 additions and 488 deletions

View File

@@ -55,8 +55,6 @@
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
/* Configuration parameters. */
enum { N_QUEUES = 2 }; /* Number of queues for dpif_recv(). */
enum { MAX_QUEUE_LEN = 100 }; /* Maximum number of packets per queue. */
enum { MAX_PORTS = 256 }; /* Maximum number of ports. */
enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
@@ -64,6 +62,17 @@ enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
* headers to be aligned on a 4-byte boundary. */
enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
/* Queues. */
enum { N_QUEUES = 2 }; /* Number of queues for dpif_recv(). */
enum { MAX_QUEUE_LEN = 128 }; /* Maximum number of packets per queue. */
enum { QUEUE_MASK = MAX_QUEUE_LEN - 1 };
BUILD_ASSERT_DECL(IS_POW2(MAX_QUEUE_LEN));
struct dp_netdev_queue {
struct dpif_upcall *upcalls[MAX_QUEUE_LEN];
unsigned int head, tail;
};
/* Datapath based on the network device interface from netdev.h. */
struct dp_netdev {
const struct dpif_class *class;
@@ -72,8 +81,7 @@ struct dp_netdev {
bool destroyed;
bool drop_frags; /* Drop all IP fragments, if true. */
struct list queues[N_QUEUES]; /* Contain ofpbufs queued for dpif_recv(). */
size_t queue_len[N_QUEUES]; /* Number of packets in each queue. */
struct dp_netdev_queue queues[N_QUEUES];
struct hmap flow_table; /* Flow table. */
/* Statistics. */
@@ -139,7 +147,8 @@ static int do_del_port(struct dp_netdev *, uint16_t port_no);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
bool create, struct dpif **);
static int dp_netdev_output_control(struct dp_netdev *, const struct ofpbuf *,
int queue_no, int port_no, uint64_t arg);
int queue_no, const struct flow *,
uint64_t arg);
static int dp_netdev_execute_actions(struct dp_netdev *,
struct ofpbuf *, struct flow *,
const struct nlattr *actions,
@@ -191,7 +200,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
dp->open_cnt = 0;
dp->drop_frags = false;
for (i = 0; i < N_QUEUES; i++) {
list_init(&dp->queues[i]);
dp->queues[i].head = dp->queues[i].tail = 0;
}
hmap_init(&dp->flow_table);
list_init(&dp->port_list);
@@ -248,7 +257,15 @@ dp_netdev_free(struct dp_netdev *dp)
do_del_port(dp, port->port_no);
}
for (i = 0; i < N_QUEUES; i++) {
ofpbuf_list_delete(&dp->queues[i]);
struct dp_netdev_queue *q = &dp->queues[i];
unsigned int j;
for (j = q->tail; j != q->head; j++) {
struct dpif_upcall *upcall = q->upcalls[j & QUEUE_MASK];
ofpbuf_delete(upcall->packet);
free(upcall);
}
}
hmap_destroy(&dp->flow_table);
free(dp->name);
@@ -931,7 +948,7 @@ dpif_netdev_recv_set_mask(struct dpif *dpif, int listen_mask)
}
}
static int
static struct dp_netdev_queue *
find_nonempty_queue(struct dpif *dpif)
{
struct dpif_netdev *dpif_netdev = dpif_netdev_cast(dpif);
@@ -940,23 +957,22 @@ find_nonempty_queue(struct dpif *dpif)
int i;
for (i = 0; i < N_QUEUES; i++) {
struct list *queue = &dp->queues[i];
if (!list_is_empty(queue) && mask & (1u << i)) {
return i;
struct dp_netdev_queue *q = &dp->queues[i];
if (q->head != q->tail && mask & (1u << i)) {
return q;
}
}
return -1;
return NULL;
}
static int
dpif_netdev_recv(struct dpif *dpif, struct ofpbuf **bufp)
dpif_netdev_recv(struct dpif *dpif, struct dpif_upcall *upcall)
{
int queue_idx = find_nonempty_queue(dpif);
if (queue_idx >= 0) {
struct dp_netdev *dp = get_dp_netdev(dpif);
*bufp = ofpbuf_from_list(list_pop_front(&dp->queues[queue_idx]));
dp->queue_len[queue_idx]--;
struct dp_netdev_queue *q = find_nonempty_queue(dpif);
if (q) {
struct dpif_upcall *u = q->upcalls[q->tail++ & QUEUE_MASK];
*upcall = *u;
free(u);
return 0;
} else {
@@ -967,7 +983,7 @@ dpif_netdev_recv(struct dpif *dpif, struct ofpbuf **bufp)
static void
dpif_netdev_recv_wait(struct dpif *dpif)
{
if (find_nonempty_queue(dpif) >= 0) {
if (find_nonempty_queue(dpif)) {
poll_immediate_wake();
} else {
/* No messages ready to be received, and dp_wait() will ensure that we
@@ -1011,7 +1027,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port,
dp->n_hit++;
} else {
dp->n_missed++;
dp_netdev_output_control(dp, packet, _ODPL_MISS_NR, port->port_no, 0);
dp_netdev_output_control(dp, packet, _ODPL_MISS_NR, &key, 0);
}
}
@@ -1212,27 +1228,33 @@ dp_netdev_output_port(struct dp_netdev *dp, struct ofpbuf *packet,
static int
dp_netdev_output_control(struct dp_netdev *dp, const struct ofpbuf *packet,
int queue_no, int port_no, uint64_t arg)
int queue_no, const struct flow *flow, uint64_t arg)
{
struct odp_msg *header;
struct ofpbuf *msg;
size_t msg_size;
struct dp_netdev_queue *q = &dp->queues[queue_no];
struct dpif_upcall *upcall;
struct ofpbuf *buf;
size_t key_len;
if (dp->queue_len[queue_no] >= MAX_QUEUE_LEN) {
if (q->head - q->tail >= MAX_QUEUE_LEN) {
dp->n_lost++;
return ENOBUFS;
}
msg_size = sizeof *header + packet->size;
msg = ofpbuf_new_with_headroom(msg_size, DPIF_RECV_MSG_PADDING);
header = ofpbuf_put_uninit(msg, sizeof *header);
header->type = queue_no;
header->length = msg_size;
header->port = port_no;
header->arg = arg;
ofpbuf_put(msg, packet->data, packet->size);
list_push_back(&dp->queues[queue_no], &msg->list_node);
dp->queue_len[queue_no]++;
buf = ofpbuf_new(ODPUTIL_FLOW_KEY_BYTES + 2 + packet->size);
odp_flow_key_from_flow(buf, flow);
key_len = buf->size;
ofpbuf_pull(buf, key_len);
ofpbuf_reserve(buf, 2);
ofpbuf_put(buf, packet->data, packet->size);
upcall = xzalloc(sizeof *upcall);
upcall->type = queue_no;
upcall->packet = buf;
upcall->key = buf->base;
upcall->key_len = key_len;
upcall->userdata = arg;
q->upcalls[++q->head & QUEUE_MASK] = upcall;
return 0;
}
@@ -1282,7 +1304,7 @@ dp_netdev_execute_actions(struct dp_netdev *dp,
case ODPAT_CONTROLLER:
dp_netdev_output_control(dp, packet, _ODPL_ACTION_NR,
key->in_port, nl_attr_get_u64(a));
key, nl_attr_get_u64(a));
break;
case ODPAT_SET_DL_TCI: