2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-30 22:05:19 +00:00

dpif-netlink: Add support for packet receive on Windows.

In this patch, we add support in dpif-netlink.c to receive packets on
Windows. Windows does not natively support epoll(). Even though there
are mechanisms/interfaces that provide functionality similar to epoll(),
we take a simple approach of using a pool of sockets.

Here are some details of the implementaion to aid review:
1. There's pool of sockets per upcall handler.
2. The pool of sockets is initialized while setting up the handler in
dpif_netlink_refresh_channels() primarily.
3. When sockets are to be allocated for a vport, we walk through the
pool of sockets for all handlers and pick one of the sockets in each of
the pool. Within a handler's pool, sockets are picked in a round-robin
fashion.
4. We currently support only 1 handler, since there are some kernel
changes needed for support more than 1 handler per vport.
5. The pool size is also set to 1 currently.

The restructions imposed by #4 and #5 can be removed in the future
without much code churn.

Validation:
1. With a hacked up kernel which figures out the netlink socket that is
designated to receive packets, we are cable to perform pings between 2
VMs on the same Hyper-V host.
2. Compiled the code in Linux as well.
3. Tested with pool size == 2 as well, though in this patch we set the
pool size = 1.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
This commit is contained in:
Nithin Raju
2014-10-23 08:27:34 -07:00
committed by Ben Pfaff
parent 36791e2178
commit 09cac43f74

View File

@@ -56,6 +56,11 @@
#include "vlog.h"
VLOG_DEFINE_THIS_MODULE(dpif_netlink);
#ifdef _WIN32
enum { WINDOWS = 1 };
#else
enum { WINDOWS = 0 };
#endif
enum { MAX_PORTS = USHRT_MAX };
/* This ethtool flag was introduced in Linux 2.6.24, so it might be
@@ -137,12 +142,32 @@ struct dpif_channel {
long long int last_poll; /* Last time this channel was polled. */
};
#ifdef _WIN32
#define VPORT_SOCK_POOL_SIZE 1
/* On Windows, there is no native support for epoll. There are equivalent
* interfaces though, that are not used currently. For simpicity, a pool of
* netlink sockets is used. Each socket is represented by 'struct
* dpif_windows_vport_sock'. Since it is a pool, multiple OVS ports may be
* sharing the same socket. In the future, we can add a reference count and
* such fields. */
struct dpif_windows_vport_sock {
struct nl_sock *nl_sock; /* netlink socket. */
};
#endif
struct dpif_handler {
struct dpif_channel *channels;/* Array of channels for each handler. */
struct epoll_event *epoll_events;
int epoll_fd; /* epoll fd that includes channel socks. */
int n_events; /* Num events returned by epoll_wait(). */
int event_offset; /* Offset into 'epoll_events'. */
#ifdef _WIN32
/* Pool of sockets. */
struct dpif_windows_vport_sock *vport_sock_pool;
size_t last_used_pool_idx; /* Index to aid in allocating a
socket in the pool to a port. */
#endif
};
/* Datapath interface for the openvswitch Linux kernel module. */
@@ -184,6 +209,7 @@ static int dpif_netlink_init(void);
static int open_dpif(const struct dpif_netlink_dp *, struct dpif **);
static uint32_t dpif_netlink_port_get_pid(const struct dpif *,
odp_port_t port_no, uint32_t hash);
static void dpif_netlink_handler_uninit(struct dpif_handler *handler);
static int dpif_netlink_refresh_channels(struct dpif_netlink *,
uint32_t n_handlers);
static void dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *,
@@ -283,7 +309,7 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
/* Destroys the netlink sockets pointed by the elements in 'socksp'
* and frees the 'socksp'. */
static void
vport_del_socksp(struct nl_sock **socksp, uint32_t n_socks)
vport_del_socksp__(struct nl_sock **socksp, uint32_t n_socks)
{
size_t i;
@@ -297,7 +323,7 @@ vport_del_socksp(struct nl_sock **socksp, uint32_t n_socks)
/* Creates an array of netlink sockets. Returns an array of the
* corresponding pointers. Records the error in 'error'. */
static struct nl_sock **
vport_create_socksp(uint32_t n_socks, int *error)
vport_create_socksp__(uint32_t n_socks, int *error)
{
struct nl_sock **socksp = xzalloc(n_socks * sizeof *socksp);
size_t i;
@@ -312,11 +338,131 @@ vport_create_socksp(uint32_t n_socks, int *error)
return socksp;
error:
vport_del_socksp(socksp, n_socks);
vport_del_socksp__(socksp, n_socks);
return NULL;
}
#ifdef _WIN32
static void
vport_delete_sock_pool(struct dpif_handler *handler)
OVS_REQ_WRLOCK(dpif->upcall_lock)
{
if (handler->vport_sock_pool) {
uint32_t i;
struct dpif_windows_vport_sock *sock_pool =
handler->vport_sock_pool;
for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
if (sock_pool[i].nl_sock) {
nl_sock_unsubscribe_packets(sock_pool[i].nl_sock);
nl_sock_destroy(sock_pool[i].nl_sock);
sock_pool[i].nl_sock = NULL;
}
}
free(handler->vport_sock_pool);
handler->vport_sock_pool = NULL;
}
}
static int
vport_create_sock_pool(struct dpif_handler *handler)
OVS_REQ_WRLOCK(dpif->upcall_lock)
{
struct dpif_windows_vport_sock *sock_pool;
size_t i;
int error = 0;
sock_pool = xzalloc(VPORT_SOCK_POOL_SIZE * sizeof *sock_pool);
for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
error = nl_sock_create(NETLINK_GENERIC, &sock_pool[i].nl_sock);
if (error) {
goto error;
}
/* Enable the netlink socket to receive packets. This is equivalent to
* calling nl_sock_join_mcgroup() to receive events. */
error = nl_sock_subscribe_packets(sock_pool[i].nl_sock);
if (error) {
goto error;
}
}
handler->vport_sock_pool = sock_pool;
handler->last_used_pool_idx = 0;
return 0;
error:
vport_delete_sock_pool(handler);
return error;
}
/* Returns an array pointers to netlink sockets. The sockets are picked from a
* pool. Records the error in 'error'. */
static struct nl_sock **
vport_create_socksp_windows(struct dpif_netlink *dpif, int *error)
OVS_REQ_WRLOCK(dpif->upcall_lock)
{
uint32_t n_socks = dpif->n_handlers;
struct nl_sock **socksp;
size_t i;
ovs_assert(n_socks <= 1);
socksp = xzalloc(n_socks * sizeof *socksp);
/* Pick netlink sockets to use in a round-robin fashion from each
* handler's pool of sockets. */
for (i = 0; i < n_socks; i++) {
struct dpif_handler *handler = &dpif->handlers[i];
struct dpif_windows_vport_sock *sock_pool = handler->vport_sock_pool;
size_t index = handler->last_used_pool_idx;
/* A pool of sockets is allocated when the handler is initialized. */
if (sock_pool == NULL) {
free(socksp);
*error = EINVAL;
return NULL;
}
ovs_assert(index < VPORT_SOCK_POOL_SIZE);
socksp[i] = sock_pool[index].nl_sock;
socksp[i] = sock_pool[index].nl_sock;
ovs_assert(socksp[i]);
index = (index == VPORT_SOCK_POOL_SIZE - 1) ? 0 : index + 1;
handler->last_used_pool_idx = index;
}
return socksp;
}
static void
vport_del_socksp_windows(struct dpif_netlink *dpif, struct nl_sock **socksp)
{
free(socksp);
}
#endif /* _WIN32 */
static struct nl_sock **
vport_create_socksp(struct dpif_netlink *dpif, int *error)
{
#ifdef _WIN32
return vport_create_socksp_windows(dpif, error);
#else
return vport_create_socksp__(dpif->n_handlers, error);
#endif
}
static void
vport_del_socksp(struct dpif_netlink *dpif, struct nl_sock **socksp)
{
#ifdef _WIN32
vport_del_socksp_windows(dpif, socksp);
#else
vport_del_socksp__(socksp, dpif->n_handlers);
#endif
}
/* Given the array of pointers to netlink sockets 'socksp', returns
* the array of corresponding pids. If the 'socksp' is NULL, returns
* a single-element array of value 0. */
@@ -354,6 +500,7 @@ vport_get_pids(struct dpif_netlink *dpif, uint32_t port_idx,
if (!dpif->handlers[0].channels[port_idx].sock) {
return false;
}
ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
pids = xzalloc(dpif->n_handlers * sizeof *pids);
@@ -415,11 +562,7 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
for (i = 0; i < dpif->n_handlers; i++) {
struct dpif_handler *handler = &dpif->handlers[i];
#ifdef _WIN32
/*
* XXX : Map appropiate Windows handle
*/
#else
#ifndef _WIN32
if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp[i]),
&event) < 0) {
error = errno;
@@ -434,11 +577,7 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
error:
for (j = 0; j < i; j++) {
#ifdef _WIN32
/*
* XXX : Map appropiate Windows handle
*/
#else
#ifndef _WIN32
epoll_ctl(dpif->handlers[j].epoll_fd, EPOLL_CTL_DEL,
nl_sock_fd(socksp[j]), NULL);
#endif
@@ -467,16 +606,11 @@ vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
for (i = 0; i < dpif->n_handlers; i++) {
struct dpif_handler *handler = &dpif->handlers[i];
#ifdef _WIN32
/*
* XXX : Map appropiate Windows handle
*/
#else
#ifndef _WIN32
epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
nl_sock_fd(handler->channels[port_idx].sock), NULL);
#endif
nl_sock_destroy(handler->channels[port_idx].sock);
#endif
handler->channels[port_idx].sock = NULL;
handler->event_offset = handler->n_events = 0;
}
@@ -517,13 +651,7 @@ destroy_all_channels(struct dpif_netlink *dpif)
for (i = 0; i < dpif->n_handlers; i++) {
struct dpif_handler *handler = &dpif->handlers[i];
#ifdef _WIN32
/*
* XXX : Map appropiate Windows handle
*/
#else
close(handler->epoll_fd);
#endif
dpif_netlink_handler_uninit(handler);
free(handler->epoll_events);
free(handler->channels);
}
@@ -688,7 +816,7 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
int error = 0;
if (dpif->handlers) {
socksp = vport_create_socksp(dpif->n_handlers, &error);
socksp = vport_create_socksp(dpif, &error);
if (!socksp) {
return error;
}
@@ -702,16 +830,14 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
VLOG_WARN_RL(&error_rl, "%s: cannot create port `%s' because it has "
"unsupported type `%s'",
dpif_name(&dpif->dpif), name, type);
vport_del_socksp(socksp, dpif->n_handlers);
vport_del_socksp(dpif, socksp);
return EINVAL;
}
request.name = name;
if (request.type == OVS_VPORT_TYPE_NETDEV) {
#ifdef _WIN32
/*
* XXX : Map appropiate Windows handle
*/
/* XXX : Map appropiate Windows handle */
#else
netdev_linux_ethtool_set_flag(netdev, ETH_FLAG_LRO, "LRO", false);
#endif
@@ -740,7 +866,7 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
dpif_name(&dpif->dpif), *port_nop);
}
vport_del_socksp(socksp, dpif->n_handlers);
vport_del_socksp(dpif, socksp);
goto exit;
}
@@ -756,7 +882,7 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
request.dp_ifindex = dpif->dp_ifindex;
request.port_no = *port_nop;
dpif_netlink_vport_transact(&request, NULL, NULL);
vport_del_socksp(socksp, dpif->n_handlers);
vport_del_socksp(dpif, socksp);
goto exit;
}
}
@@ -1500,6 +1626,34 @@ dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
}
}
#if _WIN32
static void
dpif_netlink_handler_uninit(struct dpif_handler *handler)
{
vport_delete_sock_pool(handler);
}
static int
dpif_netlink_handler_init(struct dpif_handler *handler)
{
return vport_create_sock_pool(handler);
}
#else
static int
dpif_netlink_handler_init(struct dpif_handler *handler)
{
handler->epoll_fd = epoll_create(10);
return handler->epoll_fd < 0 ? errno : 0;
}
static void
dpif_netlink_handler_uninit(struct dpif_handler *handler)
{
close(handler->epoll_fd);
}
#endif
/* Synchronizes 'channels' in 'dpif->handlers' with the set of vports
* currently in 'dpif' in the kernel, by adding a new set of channels for
* any kernel vport that lacks one and deleting any channels that have no
@@ -1517,30 +1671,30 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
int retval = 0;
size_t i;
ovs_assert(!WINDOWS || n_handlers <= 1);
ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
if (dpif->n_handlers != n_handlers) {
destroy_all_channels(dpif);
dpif->handlers = xzalloc(n_handlers * sizeof *dpif->handlers);
for (i = 0; i < n_handlers; i++) {
int error;
struct dpif_handler *handler = &dpif->handlers[i];
#ifdef _WIN32
/*
* XXX : Map appropiate Windows handle
*/
#else
handler->epoll_fd = epoll_create(10);
if (handler->epoll_fd < 0) {
error = dpif_netlink_handler_init(handler);
if (error) {
size_t j;
struct dpif_handler *tmp = &dpif->handlers[i];
for (j = 0; j < i; j++) {
close(dpif->handlers[j].epoll_fd);
dpif_netlink_handler_uninit(tmp);
}
free(dpif->handlers);
dpif->handlers = NULL;
return errno;
return error;
}
#endif
}
dpif->n_handlers = n_handlers;
}
@@ -1563,8 +1717,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
if (port_no >= dpif->uc_array_size
|| !vport_get_pids(dpif, port_no, &upcall_pids)) {
struct nl_sock **socksp = vport_create_socksp(dpif->n_handlers,
&error);
struct nl_sock **socksp = vport_create_socksp(dpif, &error);
if (!socksp) {
goto error;
@@ -1574,7 +1727,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
if (error) {
VLOG_INFO("%s: could not add channels for port %s",
dpif_name(&dpif->dpif), vport.name);
vport_del_socksp(socksp, dpif->n_handlers);
vport_del_socksp(dpif, socksp);
retval = error;
goto error;
}
@@ -1669,6 +1822,14 @@ dpif_netlink_handlers_set(struct dpif *dpif_, uint32_t n_handlers)
struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
int error = 0;
#ifdef _WIN32
/* Multiple upcall handlers will be supported once kernel datapath supports
* it. */
if (n_handlers > 1) {
return error;
}
#endif
fat_rwlock_wrlock(&dpif->upcall_lock);
if (dpif->handlers) {
error = dpif_netlink_refresh_channels(dpif, n_handlers);
@@ -1754,6 +1915,73 @@ parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
return 0;
}
#ifdef _WIN32
#define PACKET_RECV_BATCH_SIZE 50
static int
dpif_netlink_recv_windows(struct dpif_netlink *dpif, uint32_t handler_id,
struct dpif_upcall *upcall, struct ofpbuf *buf)
OVS_REQ_RDLOCK(dpif->upcall_lock)
{
struct dpif_handler *handler;
int read_tries = 0;
struct dpif_windows_vport_sock *sock_pool;
uint32_t i;
if (!dpif->handlers) {
return EAGAIN;
}
/* Only one handler is supported currently. */
if (handler_id >= 1) {
return EAGAIN;
}
if (handler_id >= dpif->n_handlers) {
return EAGAIN;
}
handler = &dpif->handlers[handler_id];
sock_pool = handler->vport_sock_pool;
for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
for (;;) {
int dp_ifindex;
int error;
if (++read_tries > PACKET_RECV_BATCH_SIZE) {
return EAGAIN;
}
error = nl_sock_recv(sock_pool[i].nl_sock, buf, false);
if (error == ENOBUFS) {
/* ENOBUFS typically means that we've received so many
* packets that the buffer overflowed. Try again
* immediately because there's almost certainly a packet
* waiting for us. */
/* XXX: report_loss(dpif, ch, idx, handler_id); */
continue;
}
/* XXX: ch->last_poll = time_msec(); */
if (error) {
if (error == EAGAIN) {
break;
}
return error;
}
error = parse_odp_packet(buf, upcall, &dp_ifindex);
if (!error && dp_ifindex == dpif->dp_ifindex) {
return 0;
} else if (error) {
return error;
}
}
}
return EAGAIN;
}
#else
static int
dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
struct dpif_upcall *upcall, struct ofpbuf *buf)
@@ -1772,15 +2000,11 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
handler->event_offset = handler->n_events = 0;
#ifdef _WIN32
retval = dpif->uc_array_size;
handler->event_offset = 0;
#else
do {
retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
dpif->uc_array_size, 0);
} while (retval < 0 && errno == EINTR);
#endif
if (retval < 0) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", ovs_strerror(errno));
@@ -1832,6 +2056,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
return EAGAIN;
}
#endif
static int
dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id,
@@ -1841,7 +2066,11 @@ dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id,
int error;
fat_rwlock_rdlock(&dpif->upcall_lock);
#ifdef _WIN32
error = dpif_netlink_recv_windows(dpif, handler_id, upcall, buf);
#else
error = dpif_netlink_recv__(dpif, handler_id, upcall, buf);
#endif
fat_rwlock_unlock(&dpif->upcall_lock);
return error;
@@ -1852,9 +2081,18 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
OVS_REQ_RDLOCK(dpif->upcall_lock)
{
#ifdef _WIN32
/*
* XXX : Map appropiate Windows handle
*/
uint32_t i;
struct dpif_windows_vport_sock *sock_pool =
dpif->handlers[handler_id].vport_sock_pool;
/* Only one handler is supported currently. */
if (handler_id >= 1) {
return;
}
for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
}
#else
if (dpif->handlers && handler_id < dpif->n_handlers) {
struct dpif_handler *handler = &dpif->handlers[handler_id];