2009-07-08 13:19:16 -07:00
|
|
|
/*
|
2010-01-22 17:26:31 -05:00
|
|
|
* Copyright (c) 2007, 2008, 2009, 2010 Nicira Networks.
|
2009-06-15 15:11:30 -07:00
|
|
|
* Distributed under the terms of the GNU GPL version 2.
|
|
|
|
*
|
|
|
|
* Significant portions of this file may be copied from parts of the Linux
|
|
|
|
* kernel, by Linus Torvalds and others.
|
2009-07-08 13:19:16 -07:00
|
|
|
*/
|
|
|
|
|
|
|
|
/* Functions for managing the dp interface/device. */
|
|
|
|
|
2010-08-30 00:24:54 -07:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/if_arp.h>
|
|
|
|
#include <linux/if_vlan.h>
|
|
|
|
#include <linux/in.h>
|
|
|
|
#include <linux/ip.h>
|
|
|
|
#include <linux/delay.h>
|
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/etherdevice.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/kthread.h>
|
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/rcupdate.h>
|
|
|
|
#include <linux/tcp.h>
|
|
|
|
#include <linux/udp.h>
|
|
|
|
#include <linux/version.h>
|
|
|
|
#include <linux/ethtool.h>
|
|
|
|
#include <linux/wait.h>
|
|
|
|
#include <asm/system.h>
|
|
|
|
#include <asm/div64.h>
|
|
|
|
#include <asm/bug.h>
|
2010-06-25 17:33:07 +08:00
|
|
|
#include <linux/highmem.h>
|
2009-07-08 13:19:16 -07:00
|
|
|
#include <linux/netfilter_bridge.h>
|
|
|
|
#include <linux/netfilter_ipv4.h>
|
|
|
|
#include <linux/inetdevice.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/rculist.h>
|
|
|
|
#include <linux/dmi.h>
|
2010-03-12 16:05:25 -05:00
|
|
|
#include <net/inet_ecn.h>
|
2010-05-13 15:25:27 -07:00
|
|
|
#include <linux/compat.h>
|
2009-07-08 13:19:16 -07:00
|
|
|
|
|
|
|
#include "openvswitch/datapath-protocol.h"
|
2010-11-22 14:17:24 -08:00
|
|
|
#include "checksum.h"
|
2009-07-08 13:19:16 -07:00
|
|
|
#include "datapath.h"
|
|
|
|
#include "actions.h"
|
|
|
|
#include "flow.h"
|
2010-10-18 15:30:20 -07:00
|
|
|
#include "loop_counter.h"
|
2010-05-13 15:25:27 -07:00
|
|
|
#include "odp-compat.h"
|
2010-04-02 16:46:18 -04:00
|
|
|
#include "table.h"
|
2010-04-12 15:53:39 -04:00
|
|
|
#include "vport-internal_dev.h"
|
2009-07-08 13:19:16 -07:00
|
|
|
|
|
|
|
#include "compat.h"
|
|
|
|
|
|
|
|
int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
|
|
|
|
EXPORT_SYMBOL(dp_ioctl_hook);
|
|
|
|
|
|
|
|
/* Datapaths. Protected on the read side by rcu_read_lock, on the write side
|
2009-09-11 10:51:36 -07:00
|
|
|
* by dp_mutex.
|
2009-07-08 13:19:16 -07:00
|
|
|
*
|
|
|
|
* dp_mutex nests inside the RTNL lock: if you need both you must take the RTNL
|
|
|
|
* lock first.
|
|
|
|
*
|
2010-12-03 13:09:26 -08:00
|
|
|
* It is safe to access the datapath and vport structures with just
|
2009-07-08 13:19:16 -07:00
|
|
|
* dp_mutex.
|
|
|
|
*/
|
|
|
|
static struct datapath *dps[ODP_MAX];
|
|
|
|
static DEFINE_MUTEX(dp_mutex);
|
|
|
|
|
2010-12-03 13:09:26 -08:00
|
|
|
static int new_vport(struct datapath *, struct odp_port *, int port_no);
|
2009-07-08 13:19:16 -07:00
|
|
|
|
|
|
|
/* Must be called with rcu_read_lock or dp_mutex. */
|
|
|
|
struct datapath *get_dp(int dp_idx)
|
|
|
|
{
|
|
|
|
if (dp_idx < 0 || dp_idx >= ODP_MAX)
|
|
|
|
return NULL;
|
|
|
|
return rcu_dereference(dps[dp_idx]);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(get_dp);
|
|
|
|
|
2010-02-10 16:50:51 -08:00
|
|
|
/* Looks up the datapath with index 'dp_idx' and, if it exists, returns it
 * with dp->mutex held; returns NULL otherwise.
 *
 * Hand-over-hand locking: dp->mutex is acquired while dp_mutex is still
 * held, so the datapath cannot be destroyed between the lookup and the
 * lock acquisition.  The caller must release dp->mutex when finished. */
static struct datapath *get_dp_locked(int dp_idx)
{
	struct datapath *dp;

	mutex_lock(&dp_mutex);
	dp = get_dp(dp_idx);
	if (dp)
		mutex_lock(&dp->mutex);
	mutex_unlock(&dp_mutex);
	return dp;
}
|
|
|
|
|
2010-04-12 15:53:39 -04:00
|
|
|
/* Must be called with rcu_read_lock or RTNL lock. */
|
|
|
|
const char *dp_name(const struct datapath *dp)
|
|
|
|
{
|
2010-12-03 13:09:26 -08:00
|
|
|
return vport_get_name(dp->ports[ODPP_LOCAL]);
|
2010-04-12 15:53:39 -04:00
|
|
|
}
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
static inline size_t br_nlmsg_size(void)
|
|
|
|
{
|
|
|
|
return NLMSG_ALIGN(sizeof(struct ifinfomsg))
|
|
|
|
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
|
|
|
|
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
|
|
|
|
+ nla_total_size(4) /* IFLA_MASTER */
|
|
|
|
+ nla_total_size(4) /* IFLA_MTU */
|
|
|
|
+ nla_total_size(4) /* IFLA_LINK */
|
|
|
|
+ nla_total_size(1); /* IFLA_OPERSTATE */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Fills 'skb' with an AF_BRIDGE ifinfo netlink message of type 'event'
 * (RTM_NEWLINK or RTM_DELLINK) describing 'port'.
 *
 * Returns the message length on success, -EMSGSIZE if 'skb' lacks room,
 * or the negative error from vport_get_ifindex()/vport_get_iflink() if
 * the vport has no valid ifindex or iflink. */
static int dp_fill_ifinfo(struct sk_buff *skb,
			  const struct vport *port,
			  int event, unsigned int flags)
{
	const struct datapath *dp = port->dp;
	int ifindex = vport_get_ifindex(port);
	int iflink = vport_get_iflink(port);
	struct ifinfomsg *hdr;
	struct nlmsghdr *nlh;

	/* Negative values are errnos from the vport layer; propagate them. */
	if (ifindex < 0)
		return ifindex;

	if (iflink < 0)
		return iflink;

	nlh = nlmsg_put(skb, 0, 0, event, sizeof(*hdr), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	hdr = nlmsg_data(nlh);
	hdr->ifi_family = AF_BRIDGE;
	hdr->__ifi_pad = 0;
	hdr->ifi_type = ARPHRD_ETHER;
	hdr->ifi_index = ifindex;
	hdr->ifi_flags = vport_get_flags(port);
	hdr->ifi_change = 0;

	/* NLA_PUT* macros jump to nla_put_failure when 'skb' runs out of
	 * room; that should not happen if br_nlmsg_size() is correct. */
	NLA_PUT_STRING(skb, IFLA_IFNAME, vport_get_name(port));
	/* The datapath's local port stands in as the "master" bridge device. */
	NLA_PUT_U32(skb, IFLA_MASTER, vport_get_ifindex(dp->ports[ODPP_LOCAL]));
	NLA_PUT_U32(skb, IFLA_MTU, vport_get_mtu(port));
#ifdef IFLA_OPERSTATE
	NLA_PUT_U8(skb, IFLA_OPERSTATE,
		   vport_is_running(port)
			? vport_get_operstate(port)
			: IF_OPER_DOWN);
#endif

	NLA_PUT(skb, IFLA_ADDRESS, ETH_ALEN, vport_get_addr(port));

	/* Only report IFLA_LINK when it differs from the device itself,
	 * matching rtnetlink convention. */
	if (ifindex != iflink)
		NLA_PUT_U32(skb, IFLA_LINK, iflink);

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
|
|
|
|
|
2010-12-03 13:09:26 -08:00
|
|
|
static void dp_ifinfo_notify(int event, struct vport *port)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
|
|
|
struct sk_buff *skb;
|
|
|
|
int err = -ENOBUFS;
|
|
|
|
|
|
|
|
skb = nlmsg_new(br_nlmsg_size(), GFP_KERNEL);
|
|
|
|
if (skb == NULL)
|
|
|
|
goto errout;
|
|
|
|
|
|
|
|
err = dp_fill_ifinfo(skb, port, event, 0);
|
|
|
|
if (err < 0) {
|
|
|
|
/* -EMSGSIZE implies BUG in br_nlmsg_size() */
|
|
|
|
WARN_ON(err == -EMSGSIZE);
|
|
|
|
kfree_skb(skb);
|
|
|
|
goto errout;
|
|
|
|
}
|
2010-04-12 15:53:39 -04:00
|
|
|
rtnl_notify(skb, &init_net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
|
2009-06-24 14:58:57 -07:00
|
|
|
return;
|
2009-07-08 13:19:16 -07:00
|
|
|
errout:
|
|
|
|
if (err < 0)
|
2010-04-12 15:53:39 -04:00
|
|
|
rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err);
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
datapath: Fix OOPS when dp_sysfs_add_if() fails.
Until now, when dp_sysfs_add_if() failed, the caller ignored the failure.
This is a minor problem, because everything else should continue working,
without sysfs entries for the interface, in theory anyhow. In actual
practice, the error exit path of dp_sysfs_add_if() does a kobject_put(),
and that kobject_put() calls release_nbp(), so that the new port gets
freed. The next reference to the new port (usually in an ovs-vswitchd call
to the ODP_PORT_LIST ioctl) will then use the freed data and probably OOPS.
The fix is to make the datapath code, as opposed to the sysfs code,
responsible for creating and destroying the net_bridge_port kobject. The
dp_sysfs_{add,del}_if() functions then just attach and detach the kobject
to sysfs and their cleanup routines no longer need to destroy the kobject
and indeed we don't care whether dp_sysfs_add_if() really succeeds.
This commit also makes the same transformation to the datapath's ifobj,
for consistency.
It is easy to trigger the OOPS fixed by this commit by adding a network
device A to a datapath, then renaming network device A to B, then renaming
network device C to A, then adding A to the datapath. The last attempt to
add A will fail because a file named /sys/class/net/<datapath>/brif/A
already exists from the time that C was added to the datapath under the
name A.
This commit also adds some compatibility infrastructure, because it moves
code out of #ifdef SUPPORT_SYSFS and it otherwise wouldn't build.
2009-08-05 15:22:25 -07:00
|
|
|
/* kobject release callback for a datapath's 'ifobj': frees the
 * struct datapath embedding the kobject.  Runs when the last kobject
 * reference is dropped (kobject_put() in do_destroy_dp()). */
static void release_dp(struct kobject *kobj)
{
	struct datapath *dp = container_of(kobj, struct datapath, ifobj);
	kfree(dp);
}
|
|
|
|
|
2010-02-10 16:50:51 -08:00
|
|
|
/* kobject type for a datapath's sysfs 'ifobj'; release_dp() frees the
 * whole struct datapath when its last reference goes away, so the
 * datapath must not be freed by any other path. */
static struct kobj_type dp_ktype = {
	.release = release_dp
};
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
/* Creates a new datapath with index 'dp_idx' and a local internal port
 * named by the userspace string 'devnamep' (or "of<dp_idx>" if NULL).
 *
 * Returns 0 on success or a negative errno: -EFAULT/-ENAMETOOLONG for a
 * bad name, -EBUSY if the datapath number is taken, -EEXIST if the device
 * name conflicts, -ENOMEM on allocation failure. */
static int create_dp(int dp_idx, const char __user *devnamep)
{
	struct odp_port internal_dev_port;
	char devname[IFNAMSIZ];
	struct datapath *dp;
	int err;
	int i;

	if (devnamep) {
		int retval = strncpy_from_user(devname, devnamep, IFNAMSIZ);
		if (retval < 0) {
			err = -EFAULT;
			goto err;
		} else if (retval >= IFNAMSIZ) {
			err = -ENAMETOOLONG;
			goto err;
		}
	} else {
		snprintf(devname, sizeof devname, "of%d", dp_idx);
	}

	rtnl_lock();
	mutex_lock(&dp_mutex);
	err = -ENODEV;
	if (!try_module_get(THIS_MODULE))
		goto err_unlock;

	/* Exit early if a datapath with that number already exists.
	 * (We don't use -EEXIST because that's ambiguous with 'devname'
	 * conflicting with an existing network device name.) */
	err = -EBUSY;
	if (get_dp(dp_idx))
		goto err_put_module;

	err = -ENOMEM;
	dp = kzalloc(sizeof *dp, GFP_KERNEL);
	if (dp == NULL)
		goto err_put_module;
	INIT_LIST_HEAD(&dp->port_list);
	mutex_init(&dp->mutex);
	dp->dp_idx = dp_idx;
	for (i = 0; i < DP_N_QUEUES; i++)
		skb_queue_head_init(&dp->queues[i]);
	init_waitqueue_head(&dp->waitqueue);

	/* Initialize kobject for bridge.  This will be added as
	 * /sys/class/net/<devname>/brif later, if sysfs is enabled. */
	dp->ifobj.kset = NULL;
	kobject_init(&dp->ifobj, &dp_ktype);

	/* Allocate table. */
	err = -ENOMEM;
	rcu_assign_pointer(dp->table, tbl_create(0));
	if (!dp->table)
		goto err_free_dp;

	/* Set up our datapath device. */
	BUILD_BUG_ON(sizeof(internal_dev_port.devname) != sizeof(devname));
	strcpy(internal_dev_port.devname, devname);
	strcpy(internal_dev_port.type, "internal");
	err = new_vport(dp, &internal_dev_port, ODPP_LOCAL);
	if (err) {
		if (err == -EBUSY)
			err = -EEXIST;

		goto err_destroy_table;
	}

	dp->drop_frags = 0;

	/* Fix: 'err' is 0 here after a successful new_vport(), so without
	 * resetting it a failed alloc_percpu() would make create_dp()
	 * return 0 (success) after tearing the datapath down. */
	err = -ENOMEM;
	dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
	if (!dp->stats_percpu)
		goto err_destroy_local_port;

	rcu_assign_pointer(dps[dp_idx], dp);
	dp_sysfs_add_dp(dp);

	mutex_unlock(&dp_mutex);
	rtnl_unlock();

	return 0;

err_destroy_local_port:
	dp_detach_port(dp->ports[ODPP_LOCAL]);
err_destroy_table:
	tbl_destroy(dp->table, NULL);
err_free_dp:
	kfree(dp);
err_put_module:
	module_put(THIS_MODULE);
err_unlock:
	mutex_unlock(&dp_mutex);
	rtnl_unlock();
err:
	return err;
}
|
|
|
|
|
datapath: Fix race against workqueue in dp_dev and simplify code.
The dp_dev_destroy() function failed to cancel the xmit_queue work, which
allowed it to run after the device had been destroyed, accessing freed
memory. However, simply canceling the work with cancel_work_sync() would
be insufficient, since other packets could get queued while the work
function was running. Stopping the queue with netif_tx_disable() doesn't
help, because the final action in dp_dev_do_xmit() is to re-enable the TX
queue.
This issue led me to re-examine why the dp_dev needs to use a work_struct
at all. This was implemented in commit 71f13ed0b "Send of0 packets from
workqueue, to avoid recursive locking of ofN device" due to a complaint
from lockdep about recursive locking.
However, there's no actual reason that we need any locking around
dp_dev_xmit(). Until now, it has accepted the standard locking provided
by the network stack. But looking at the other software devices (veth,
loopback), those use NETIF_F_LLTX, which disables this locking, and
presumably do so for this very reason. In fact, the lwn article at
http://lwn.net/Articles/121566/ hints that NETIF_F_LLTX, which is otherwise
discouraged in the kernel, is acceptable for "certain types of software
device."
So this commit switches to using NETIF_F_LLTX for dp_dev and gets rid
of the work_struct.
In the process, I noticed that veth and loopback also take advantage of
a network device destruction "hook" using the net_device "destructor"
member. Using this we can automatically get called on network device
destruction at the point where rtnl_unlock() is called. This allows us
to stop stringing the dp_devs that are being destroyed onto a list so
that we can free them, and thus simplifies the code along all the paths
that call dp_dev_destroy().
This commit gets rid of a call to synchronize_rcu() (disguised as a call
to synchronize_net(), which is a macro that expands to synchronize_rcu()),
so it probably speeds up deleting ports, too.
2009-07-08 12:23:32 -07:00
|
|
|
/* Tears down 'dp' and drops the reference that frees it.
 * Caller holds the RTNL lock and dp_mutex (see destroy_dp()). */
static void do_destroy_dp(struct datapath *dp)
{
	struct vport *p, *n;
	int i;

	/* Detach every port except the local one; the local port is
	 * detached last, after the datapath is unpublished below. */
	list_for_each_entry_safe (p, n, &dp->port_list, node)
		if (p->port_no != ODPP_LOCAL)
			dp_detach_port(p);

	dp_sysfs_del_dp(dp);

	/* Unpublish the datapath so new RCU readers cannot find it. */
	rcu_assign_pointer(dps[dp->dp_idx], NULL);

	dp_detach_port(dp->ports[ODPP_LOCAL]);

	tbl_destroy(dp->table, flow_free_tbl);

	/* Discard any packets still queued for delivery to userspace. */
	for (i = 0; i < DP_N_QUEUES; i++)
		skb_queue_purge(&dp->queues[i]);
	free_percpu(dp->stats_percpu);

	/* Dropping the last kobject reference frees 'dp' via release_dp(). */
	kobject_put(&dp->ifobj);
	module_put(THIS_MODULE);
}
|
|
|
|
|
|
|
|
static int destroy_dp(int dp_idx)
|
|
|
|
{
|
|
|
|
struct datapath *dp;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
rtnl_lock();
|
|
|
|
mutex_lock(&dp_mutex);
|
|
|
|
dp = get_dp(dp_idx);
|
|
|
|
err = -ENODEV;
|
|
|
|
if (!dp)
|
|
|
|
goto err_unlock;
|
|
|
|
|
datapath: Fix race against workqueue in dp_dev and simplify code.
The dp_dev_destroy() function failed to cancel the xmit_queue work, which
allowed it to run after the device had been destroyed, accessing freed
memory. However, simply canceling the work with cancel_work_sync() would
be insufficient, since other packets could get queued while the work
function was running. Stopping the queue with netif_tx_disable() doesn't
help, because the final action in dp_dev_do_xmit() is to re-enable the TX
queue.
This issue led me to re-examine why the dp_dev needs to use a work_struct
at all. This was implemented in commit 71f13ed0b "Send of0 packets from
workqueue, to avoid recursive locking of ofN device" due to a complaint
from lockdep about recursive locking.
However, there's no actual reason that we need any locking around
dp_dev_xmit(). Until now, it has accepted the standard locking provided
by the network stack. But looking at the other software devices (veth,
loopback), those use NETIF_F_LLTX, which disables this locking, and
presumably do so for this very reason. In fact, the lwn article at
http://lwn.net/Articles/121566/ hints that NETIF_F_LLTX, which is otherwise
discouraged in the kernel, is acceptable for "certain types of software
device."
So this commit switches to using NETIF_F_LLTX for dp_dev and gets rid
of the work_struct.
In the process, I noticed that veth and loopback also take advantage of
a network device destruction "hook" using the net_device "destructor"
member. Using this we can automatically get called on network device
destruction at the point where rtnl_unlock() is called. This allows us
to stop stringing the dp_devs that are being destroyed onto a list so
that we can free them, and thus simplifies the code along all the paths
that call dp_dev_destroy().
This commit gets rid of a call to synchronize_rcu() (disguised as a call
to synchronize_net(), which is a macro that expands to synchronize_rcu()),
so it probably speeds up deleting ports, too.
2009-07-08 12:23:32 -07:00
|
|
|
do_destroy_dp(dp);
|
2009-07-08 13:19:16 -07:00
|
|
|
err = 0;
|
|
|
|
|
|
|
|
err_unlock:
|
|
|
|
mutex_unlock(&dp_mutex);
|
|
|
|
rtnl_unlock();
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Called with RTNL lock and dp_mutex.
 *
 * Creates a vport from 'odp_port', installs it in 'dp' at 'port_no', and
 * broadcasts an RTM_NEWLINK notification for it.  Returns 0 on success or
 * the negative errno from vport_add() on failure. */
static int new_vport(struct datapath *dp, struct odp_port *odp_port, int port_no)
{
	struct vport_parms parms;
	struct vport *vport;

	parms.name = odp_port->devname;
	parms.type = odp_port->type;
	parms.config = odp_port->config;
	parms.dp = dp;
	parms.port_no = port_no;

	vport_lock();
	vport = vport_add(&parms);
	vport_unlock();

	if (IS_ERR(vport))
		return PTR_ERR(vport);

	/* Publish the fully-initialized vport to RCU readers. */
	rcu_assign_pointer(dp->ports[port_no], vport);
	list_add_rcu(&vport->node, &dp->port_list);
	dp->n_ports++;

	dp_ifinfo_notify(RTM_NEWLINK, vport);

	return 0;
}
|
|
|
|
|
2010-04-12 15:53:39 -04:00
|
|
|
/* ODP_PORT_ATTACH ioctl handler: attaches the port described by the
 * userspace structure '*portp' to datapath 'dp_idx', writing the chosen
 * port number back to 'portp->port'.  Returns 0 or a negative errno
 * (-EFAULT, -ENODEV, -EFBIG when all port numbers are taken, or the
 * error from new_vport()). */
static int attach_port(int dp_idx, struct odp_port __user *portp)
{
	struct datapath *dp;
	struct odp_port port;
	int port_no;
	int err;

	err = -EFAULT;
	if (copy_from_user(&port, portp, sizeof port))
		goto out;
	/* Userspace strings are not trusted to be terminated. */
	port.devname[IFNAMSIZ - 1] = '\0';
	port.type[VPORT_TYPE_SIZE - 1] = '\0';

	rtnl_lock();
	dp = get_dp_locked(dp_idx);
	err = -ENODEV;
	if (!dp)
		goto out_unlock_rtnl;

	/* Pick the lowest free port number (0 is reserved for ODPP_LOCAL). */
	err = -EFBIG;
	for (port_no = 1; port_no < DP_MAX_PORTS; port_no++)
		if (!dp->ports[port_no])
			break;
	if (port_no == DP_MAX_PORTS)
		goto out_unlock_dp;

	err = new_vport(dp, &port, port_no);
	if (err)
		goto out_unlock_dp;

	set_internal_devs_mtu(dp);
	dp_sysfs_add_if(dp->ports[port_no]);

	err = put_user(port_no, &portp->port);

out_unlock_dp:
	mutex_unlock(&dp->mutex);
out_unlock_rtnl:
	rtnl_unlock();
out:
	return err;
}
|
|
|
|
|
2010-12-03 13:09:26 -08:00
|
|
|
/* Detaches vport 'p' from its datapath and destroys it.  Caller must hold
 * the RTNL lock.  Returns the result of vport_del(). */
int dp_detach_port(struct vport *p)
{
	int err;

	ASSERT_RTNL();

	/* The local port has no per-interface sysfs entry (see attach_port()). */
	if (p->port_no != ODPP_LOCAL)
		dp_sysfs_del_if(p);
	dp_ifinfo_notify(RTM_DELLINK, p);

	/* First drop references to device. */
	p->dp->n_ports--;
	list_del_rcu(&p->node);
	rcu_assign_pointer(p->dp->ports[p->port_no], NULL);

	/* Then destroy it. */
	vport_lock();
	err = vport_del(p);
	vport_unlock();

	return err;
}
|
|
|
|
|
2010-04-12 15:53:39 -04:00
|
|
|
static int detach_port(int dp_idx, int port_no)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
2010-12-03 13:09:26 -08:00
|
|
|
struct vport *p;
|
2009-07-08 13:19:16 -07:00
|
|
|
struct datapath *dp;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = -EINVAL;
|
|
|
|
if (port_no < 0 || port_no >= DP_MAX_PORTS || port_no == ODPP_LOCAL)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
rtnl_lock();
|
|
|
|
dp = get_dp_locked(dp_idx);
|
|
|
|
err = -ENODEV;
|
|
|
|
if (!dp)
|
|
|
|
goto out_unlock_rtnl;
|
|
|
|
|
|
|
|
p = dp->ports[port_no];
|
|
|
|
err = -ENOENT;
|
|
|
|
if (!p)
|
|
|
|
goto out_unlock_dp;
|
|
|
|
|
2010-12-03 14:41:38 -08:00
|
|
|
err = dp_detach_port(p);
|
2009-07-08 13:19:16 -07:00
|
|
|
|
|
|
|
out_unlock_dp:
|
|
|
|
mutex_unlock(&dp->mutex);
|
|
|
|
out_unlock_rtnl:
|
|
|
|
rtnl_unlock();
|
|
|
|
out:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2010-05-12 12:40:45 -07:00
|
|
|
/* Must be called with rcu_read_lock. */
|
2010-12-03 13:09:26 -08:00
|
|
|
void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
|
|
|
struct datapath *dp = p->dp;
|
|
|
|
struct dp_stats_percpu *stats;
|
2010-05-12 12:40:45 -07:00
|
|
|
int stats_counter_off;
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
struct sw_flow_actions *acts;
|
|
|
|
struct loop_counter *loop;
|
2010-08-27 12:32:05 -07:00
|
|
|
int error;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-12-03 13:09:26 -08:00
|
|
|
OVS_CB(skb)->vport = p;
|
2010-02-28 12:17:16 -05:00
|
|
|
|
2010-08-29 10:49:11 -07:00
|
|
|
if (!OVS_CB(skb)->flow) {
|
|
|
|
struct odp_flow_key key;
|
|
|
|
struct tbl_node *flow_node;
|
2010-08-29 14:28:58 -07:00
|
|
|
bool is_frag;
|
2010-08-27 12:32:05 -07:00
|
|
|
|
2010-08-29 10:49:11 -07:00
|
|
|
/* Extract flow from 'skb' into 'key'. */
|
2010-08-29 14:28:58 -07:00
|
|
|
error = flow_extract(skb, p ? p->port_no : ODPP_NONE, &key, &is_frag);
|
2010-08-29 10:49:11 -07:00
|
|
|
if (unlikely(error)) {
|
|
|
|
kfree_skb(skb);
|
|
|
|
return;
|
|
|
|
}
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-08-29 14:28:58 -07:00
|
|
|
if (is_frag && dp->drop_frags) {
|
2010-08-29 10:49:11 -07:00
|
|
|
kfree_skb(skb);
|
|
|
|
stats_counter_off = offsetof(struct dp_stats_percpu, n_frags);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Look up flow. */
|
|
|
|
flow_node = tbl_lookup(rcu_dereference(dp->table), &key,
|
|
|
|
flow_hash(&key), flow_cmp);
|
|
|
|
if (unlikely(!flow_node)) {
|
|
|
|
dp_output_control(dp, skb, _ODPL_MISS_NR, OVS_CB(skb)->tun_id);
|
|
|
|
stats_counter_off = offsetof(struct dp_stats_percpu, n_missed);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
OVS_CB(skb)->flow = flow_cast(flow_node);
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
}
|
|
|
|
|
2010-08-29 10:49:11 -07:00
|
|
|
flow_used(OVS_CB(skb)->flow, skb);
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
|
2010-08-29 10:49:11 -07:00
|
|
|
acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
|
|
|
|
/* Check whether we've looped too much. */
|
2010-10-18 15:30:20 -07:00
|
|
|
loop = loop_get_counter();
|
|
|
|
if (unlikely(++loop->count > MAX_LOOPS))
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
loop->looping = true;
|
|
|
|
if (unlikely(loop->looping)) {
|
2010-10-18 15:30:20 -07:00
|
|
|
loop_suppress(dp, acts);
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
goto out_loop;
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
2010-05-12 12:40:45 -07:00
|
|
|
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
/* Execute actions. */
|
2010-08-29 10:49:11 -07:00
|
|
|
execute_actions(dp, skb, &OVS_CB(skb)->flow->key, acts->actions,
|
2010-12-10 10:40:58 -08:00
|
|
|
acts->actions_len);
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that are
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
stats_counter_off = offsetof(struct dp_stats_percpu, n_hit);
|
|
|
|
|
|
|
|
/* Check whether sub-actions looped too much. */
|
|
|
|
if (unlikely(loop->looping))
|
2010-10-18 15:30:20 -07:00
|
|
|
loop_suppress(dp, acts);
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that a
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
|
|
|
|
out_loop:
|
|
|
|
/* Decrement loop counter. */
|
|
|
|
if (!--loop->count)
|
|
|
|
loop->looping = false;
|
2010-10-18 15:30:20 -07:00
|
|
|
loop_put_counter();
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that a
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
|
2010-05-12 12:40:45 -07:00
|
|
|
out:
|
datapath: Detect and suppress flows that are implicated in loops.
In-kernel loops need to be suppressed; otherwise, they cause high CPU
consumption, even to the point that the machine becomes unusable. Ideally
these flows should never be added to the Open vSwitch flow table, but it
is fairly easy for a buggy controller to create them given the menagerie
of tunnels, patches, etc. that OVS makes available.
Commit ecbb6953b "datapath: Add loop checking" did the initial work
toward suppressing loops, by dropping packets that recursed more than 5
times. This at least prevented the kernel stack from overflowing and
thereby OOPSing the machine. But even with this commit, it is still
possible to waste a lot of CPU time due to loops. The problem is not
limited to 5 recursive calls per packet: any packet can be sent to
multiple destinations, which in turn can themselves be sent to multiple
destinations, and so on. We have actually seen in practice a case where
each packet was, apparently, sent to at least 2 destinations per hop, so
that each packet actually consumed CPU time for 2**5 == 32 packets,
possibly more.
This commit takes loop suppression a step further, by clearing the actions
of flows that are implicated in loops. Thus, after the first packet in
such a flow, later packets for either the "root" flow or for flows that
it ends up looping through are simply discarded, saving a huge amount of
CPU time.
This version of the commit just clears the actions from the flows that a
part of the loop. Probably, there should be some additional action to tell
ovs-vswitchd that a loop has been detected, so that it can in turn inform
the controller one way or another.
My test case was this:
ovs-controller -H --max-idle=permanent punix:/tmp/controller
ovs-vsctl -- \
set-controller br0 unix:/tmp/controller -- \
add-port br0 patch00 -- \
add-port br0 patch01 -- \
add-port br0 patch10 -- \
add-port br0 patch11 -- \
add-port br0 patch20 -- \
add-port br0 patch21 -- \
add-port br0 patch30 -- \
add-port br0 patch31 -- \
set Interface patch00 type=patch options:peer=patch01 -- \
set Interface patch01 type=patch options:peer=patch00 -- \
set Interface patch10 type=patch options:peer=patch11 -- \
set Interface patch11 type=patch options:peer=patch10 -- \
set Interface patch20 type=patch options:peer=patch21 -- \
set Interface patch21 type=patch options:peer=patch20 -- \
set Interface patch30 type=patch options:peer=patch31 -- \
set Interface patch31 type=patch options:peer=patch30
followed by sending a single "ping" packet from an attached Ethernet
port into the bridge. After this, without this commit the vswitch
userspace and kernel consume 50-75% of the machine's CPU (in my KVM
test setup on a single physical host); with this commit, some CPU is
consumed initially but it converges on 0% quickly.
A more challenging test sends a series of packets in multiple flows;
I used "hping3" with its default options. Without this commit, the
vswitch consumes 100% of the machine's CPU, most of which is in the
kernel. With this commit, the vswitch consumes "only" 33-50% CPU,
most of which is in userspace, so the machine is more responsive.
A refinement on this commit would be to pass the loop counter down to
userspace as part of the odp_msg struct and then back up as part of
the ODP_EXECUTE command arguments. This would, presumably, reduce
the CPU requirements, since it would allow loop detection to happen
earlier, during initial setup of flows, instead of just on the second
and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
|
|
|
/* Update datapath statistics. */
|
2010-05-12 12:40:45 -07:00
|
|
|
local_bh_disable();
|
|
|
|
stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
|
2010-07-28 18:20:43 -07:00
|
|
|
|
|
|
|
write_seqcount_begin(&stats->seqlock);
|
2010-05-12 12:40:45 -07:00
|
|
|
(*(u64 *)((u8 *)stats + stats_counter_off))++;
|
2010-07-28 18:20:43 -07:00
|
|
|
write_seqcount_end(&stats->seqlock);
|
|
|
|
|
2010-05-12 12:40:45 -07:00
|
|
|
local_bh_enable();
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
2009-09-14 09:20:58 -07:00
|
|
|
/* Append each packet in 'skb' list to 'queue'. There will be only one packet
|
|
|
|
* unless we broke up a GSO packet. */
|
2010-07-14 19:27:18 -07:00
|
|
|
static int queue_control_packets(struct sk_buff *skb, struct sk_buff_head *queue,
|
2010-12-10 10:42:42 -08:00
|
|
|
int queue_no, u64 arg)
|
2009-09-14 09:20:58 -07:00
|
|
|
{
|
|
|
|
struct sk_buff *nskb;
|
|
|
|
int port_no;
|
|
|
|
int err;
|
|
|
|
|
2010-12-03 13:09:26 -08:00
|
|
|
if (OVS_CB(skb)->vport)
|
|
|
|
port_no = OVS_CB(skb)->vport->port_no;
|
2010-04-12 15:53:39 -04:00
|
|
|
else
|
|
|
|
port_no = ODPP_LOCAL;
|
2009-09-14 09:20:58 -07:00
|
|
|
|
|
|
|
do {
|
|
|
|
struct odp_msg *header;
|
|
|
|
|
|
|
|
nskb = skb->next;
|
|
|
|
skb->next = NULL;
|
|
|
|
|
|
|
|
err = skb_cow(skb, sizeof *header);
|
|
|
|
if (err)
|
|
|
|
goto err_kfree_skbs;
|
|
|
|
|
|
|
|
header = (struct odp_msg*)__skb_push(skb, sizeof *header);
|
|
|
|
header->type = queue_no;
|
|
|
|
header->length = skb->len;
|
|
|
|
header->port = port_no;
|
|
|
|
header->arg = arg;
|
|
|
|
skb_queue_tail(queue, skb);
|
|
|
|
|
|
|
|
skb = nskb;
|
|
|
|
} while (skb);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_kfree_skbs:
|
|
|
|
kfree_skb(skb);
|
|
|
|
while ((skb = nskb) != NULL) {
|
|
|
|
nskb = skb->next;
|
|
|
|
kfree_skb(skb);
|
|
|
|
}
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2010-07-14 19:27:18 -07:00
|
|
|
int dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
|
2010-12-10 10:42:42 -08:00
|
|
|
u64 arg)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
|
|
|
struct dp_stats_percpu *stats;
|
|
|
|
struct sk_buff_head *queue;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
WARN_ON_ONCE(skb_shared(skb));
|
2010-01-04 13:08:37 -08:00
|
|
|
BUG_ON(queue_no != _ODPL_MISS_NR && queue_no != _ODPL_ACTION_NR && queue_no != _ODPL_SFLOW_NR);
|
2009-07-08 13:19:16 -07:00
|
|
|
queue = &dp->queues[queue_no];
|
|
|
|
err = -ENOBUFS;
|
|
|
|
if (skb_queue_len(queue) >= DP_MAX_QUEUE_LEN)
|
|
|
|
goto err_kfree_skb;
|
|
|
|
|
2010-01-22 17:26:31 -05:00
|
|
|
forward_ip_summed(skb);
|
|
|
|
|
2010-06-17 15:15:11 -07:00
|
|
|
err = vswitch_skb_checksum_setup(skb);
|
|
|
|
if (err)
|
|
|
|
goto err_kfree_skb;
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
/* Break apart GSO packets into their component pieces. Otherwise
|
|
|
|
* userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
|
|
|
|
if (skb_is_gso(skb)) {
|
2010-06-17 15:20:16 -07:00
|
|
|
struct sk_buff *nskb = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
|
2010-12-08 23:55:20 -08:00
|
|
|
|
|
|
|
kfree_skb(skb);
|
|
|
|
skb = nskb;
|
|
|
|
if (unlikely(IS_ERR(skb))) {
|
|
|
|
err = PTR_ERR(skb);
|
|
|
|
goto err;
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-09-14 09:20:58 -07:00
|
|
|
err = queue_control_packets(skb, queue, queue_no, arg);
|
2009-07-08 13:19:16 -07:00
|
|
|
wake_up_interruptible(&dp->waitqueue);
|
2009-09-14 09:20:58 -07:00
|
|
|
return err;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
|
|
|
err_kfree_skb:
|
|
|
|
kfree_skb(skb);
|
|
|
|
err:
|
2010-05-12 11:40:58 -07:00
|
|
|
local_bh_disable();
|
|
|
|
stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
|
2010-07-28 18:20:43 -07:00
|
|
|
|
|
|
|
write_seqcount_begin(&stats->seqlock);
|
2009-07-08 13:19:16 -07:00
|
|
|
stats->n_lost++;
|
2010-07-28 18:20:43 -07:00
|
|
|
write_seqcount_end(&stats->seqlock);
|
|
|
|
|
2010-05-12 11:40:58 -07:00
|
|
|
local_bh_enable();
|
2009-07-08 13:19:16 -07:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int flush_flows(struct datapath *dp)
|
|
|
|
{
|
2010-04-02 16:46:18 -04:00
|
|
|
struct tbl *old_table = rcu_dereference(dp->table);
|
|
|
|
struct tbl *new_table;
|
|
|
|
|
|
|
|
new_table = tbl_create(0);
|
|
|
|
if (!new_table)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
rcu_assign_pointer(dp->table, new_table);
|
|
|
|
|
|
|
|
tbl_deferred_destroy(old_table, flow_free_tbl);
|
|
|
|
|
|
|
|
return 0;
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
static int validate_actions(const struct nlattr *actions, u32 actions_len)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
2010-12-10 10:40:58 -08:00
|
|
|
const struct nlattr *a;
|
|
|
|
int rem;
|
|
|
|
|
|
|
|
nla_for_each_attr(a, actions, actions_len, rem) {
|
|
|
|
static const u32 action_lens[ODPAT_MAX + 1] = {
|
|
|
|
[ODPAT_OUTPUT] = 4,
|
2010-12-10 10:42:42 -08:00
|
|
|
[ODPAT_CONTROLLER] = 8,
|
2010-12-10 10:40:58 -08:00
|
|
|
[ODPAT_SET_DL_TCI] = 2,
|
|
|
|
[ODPAT_STRIP_VLAN] = 0,
|
|
|
|
[ODPAT_SET_DL_SRC] = ETH_ALEN,
|
|
|
|
[ODPAT_SET_DL_DST] = ETH_ALEN,
|
|
|
|
[ODPAT_SET_NW_SRC] = 4,
|
|
|
|
[ODPAT_SET_NW_DST] = 4,
|
|
|
|
[ODPAT_SET_NW_TOS] = 1,
|
|
|
|
[ODPAT_SET_TP_SRC] = 2,
|
|
|
|
[ODPAT_SET_TP_DST] = 2,
|
2010-12-10 10:42:42 -08:00
|
|
|
[ODPAT_SET_TUNNEL] = 8,
|
2010-12-10 10:40:58 -08:00
|
|
|
[ODPAT_SET_PRIORITY] = 4,
|
|
|
|
[ODPAT_POP_PRIORITY] = 0,
|
|
|
|
[ODPAT_DROP_SPOOFED_ARP] = 0,
|
|
|
|
};
|
|
|
|
int type = nla_type(a);
|
|
|
|
|
|
|
|
if (type > ODPAT_MAX || nla_len(a) != action_lens[type])
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case ODPAT_UNSPEC:
|
|
|
|
return -EINVAL;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
case ODPAT_CONTROLLER:
|
|
|
|
case ODPAT_STRIP_VLAN:
|
|
|
|
case ODPAT_SET_DL_SRC:
|
|
|
|
case ODPAT_SET_DL_DST:
|
|
|
|
case ODPAT_SET_NW_SRC:
|
|
|
|
case ODPAT_SET_NW_DST:
|
|
|
|
case ODPAT_SET_TP_SRC:
|
|
|
|
case ODPAT_SET_TP_DST:
|
|
|
|
case ODPAT_SET_TUNNEL:
|
|
|
|
case ODPAT_SET_PRIORITY:
|
|
|
|
case ODPAT_POP_PRIORITY:
|
|
|
|
case ODPAT_DROP_SPOOFED_ARP:
|
|
|
|
/* No validation needed. */
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ODPAT_OUTPUT:
|
|
|
|
if (nla_get_u32(a) >= DP_MAX_PORTS)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
case ODPAT_SET_DL_TCI:
|
|
|
|
if (nla_get_be16(a) & htons(VLAN_CFI_MASK))
|
2009-07-08 13:19:16 -07:00
|
|
|
return -EINVAL;
|
2010-12-10 10:40:58 -08:00
|
|
|
break;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
case ODPAT_SET_NW_TOS:
|
|
|
|
if (nla_get_u8(a) & INET_ECN_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
break;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
default:
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
}
|
2010-03-12 16:05:25 -05:00
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
if (rem > 0)
|
|
|
|
return -EINVAL;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
return 0;
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct sw_flow_actions *get_actions(const struct odp_flow *flow)
|
|
|
|
{
|
|
|
|
struct sw_flow_actions *actions;
|
|
|
|
int error;
|
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
actions = flow_actions_alloc(flow->actions_len);
|
2009-07-08 13:19:16 -07:00
|
|
|
error = PTR_ERR(actions);
|
|
|
|
if (IS_ERR(actions))
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
error = -EFAULT;
|
2010-12-10 10:40:58 -08:00
|
|
|
if (copy_from_user(actions->actions, flow->actions, flow->actions_len))
|
2009-07-08 13:19:16 -07:00
|
|
|
goto error_free_actions;
|
2010-12-10 10:40:58 -08:00
|
|
|
error = validate_actions(actions->actions, actions->actions_len);
|
2009-07-08 13:19:16 -07:00
|
|
|
if (error)
|
|
|
|
goto error_free_actions;
|
|
|
|
|
|
|
|
return actions;
|
|
|
|
|
|
|
|
error_free_actions:
|
|
|
|
kfree(actions);
|
|
|
|
error:
|
|
|
|
return ERR_PTR(error);
|
|
|
|
}
|
|
|
|
|
2010-10-08 16:26:15 -07:00
|
|
|
static void get_stats(struct sw_flow *flow, struct odp_flow_stats *stats)
|
2010-07-15 19:22:07 -07:00
|
|
|
{
|
|
|
|
if (flow->used) {
|
2010-10-08 16:26:15 -07:00
|
|
|
struct timespec offset_ts, used, now_mono;
|
2010-07-15 19:22:07 -07:00
|
|
|
|
2010-10-08 16:26:15 -07:00
|
|
|
ktime_get_ts(&now_mono);
|
|
|
|
jiffies_to_timespec(jiffies - flow->used, &offset_ts);
|
|
|
|
set_normalized_timespec(&used, now_mono.tv_sec - offset_ts.tv_sec,
|
|
|
|
now_mono.tv_nsec - offset_ts.tv_nsec);
|
2010-07-15 19:22:07 -07:00
|
|
|
|
|
|
|
stats->used_sec = used.tv_sec;
|
|
|
|
stats->used_nsec = used.tv_nsec;
|
2009-07-08 13:19:16 -07:00
|
|
|
} else {
|
|
|
|
stats->used_sec = 0;
|
|
|
|
stats->used_nsec = 0;
|
|
|
|
}
|
2010-07-15 19:22:07 -07:00
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
stats->n_packets = flow->packet_count;
|
|
|
|
stats->n_bytes = flow->byte_count;
|
2010-07-27 10:02:07 -07:00
|
|
|
stats->reserved = 0;
|
2009-07-08 13:19:16 -07:00
|
|
|
stats->tcp_flags = flow->tcp_flags;
|
2009-06-17 12:41:30 -07:00
|
|
|
stats->error = 0;
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void clear_stats(struct sw_flow *flow)
|
|
|
|
{
|
2010-07-15 19:22:07 -07:00
|
|
|
flow->used = 0;
|
2009-07-08 13:19:16 -07:00
|
|
|
flow->tcp_flags = 0;
|
|
|
|
flow->packet_count = 0;
|
|
|
|
flow->byte_count = 0;
|
|
|
|
}
|
|
|
|
|
2010-04-02 16:46:18 -04:00
|
|
|
static int expand_table(struct datapath *dp)
|
|
|
|
{
|
|
|
|
struct tbl *old_table = rcu_dereference(dp->table);
|
|
|
|
struct tbl *new_table;
|
|
|
|
|
|
|
|
new_table = tbl_expand(old_table);
|
|
|
|
if (IS_ERR(new_table))
|
|
|
|
return PTR_ERR(new_table);
|
|
|
|
|
|
|
|
rcu_assign_pointer(dp->table, new_table);
|
|
|
|
tbl_deferred_destroy(old_table, NULL);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
static int do_put_flow(struct datapath *dp, struct odp_flow_put *uf,
|
|
|
|
struct odp_flow_stats *stats)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
2010-04-02 16:46:18 -04:00
|
|
|
struct tbl_node *flow_node;
|
2009-09-01 10:31:32 -07:00
|
|
|
struct sw_flow *flow;
|
2010-04-02 16:46:18 -04:00
|
|
|
struct tbl *table;
|
2009-07-08 13:19:16 -07:00
|
|
|
int error;
|
|
|
|
|
|
|
|
table = rcu_dereference(dp->table);
|
2010-05-13 13:21:33 -07:00
|
|
|
flow_node = tbl_lookup(table, &uf->flow.key, flow_hash(&uf->flow.key), flow_cmp);
|
2010-04-02 16:46:18 -04:00
|
|
|
if (!flow_node) {
|
2009-09-01 10:31:32 -07:00
|
|
|
/* No such flow. */
|
2009-07-08 13:19:16 -07:00
|
|
|
struct sw_flow_actions *acts;
|
|
|
|
|
|
|
|
error = -ENOENT;
|
2010-05-13 13:21:33 -07:00
|
|
|
if (!(uf->flags & ODPPF_CREATE))
|
2009-07-08 13:19:16 -07:00
|
|
|
goto error;
|
|
|
|
|
|
|
|
/* Expand table, if necessary, to make room. */
|
2010-04-02 16:46:18 -04:00
|
|
|
if (tbl_count(table) >= tbl_n_buckets(table)) {
|
|
|
|
error = expand_table(dp);
|
2009-07-08 13:19:16 -07:00
|
|
|
if (error)
|
|
|
|
goto error;
|
2009-09-01 10:31:32 -07:00
|
|
|
table = rcu_dereference(dp->table);
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Allocate flow. */
|
2010-07-26 18:46:27 -07:00
|
|
|
flow = flow_alloc();
|
|
|
|
if (IS_ERR(flow)) {
|
|
|
|
error = PTR_ERR(flow);
|
2009-07-08 13:19:16 -07:00
|
|
|
goto error;
|
2010-07-26 18:46:27 -07:00
|
|
|
}
|
2010-05-13 13:21:33 -07:00
|
|
|
flow->key = uf->flow.key;
|
2009-07-08 13:19:16 -07:00
|
|
|
clear_stats(flow);
|
|
|
|
|
|
|
|
/* Obtain actions. */
|
2010-05-13 13:21:33 -07:00
|
|
|
acts = get_actions(&uf->flow);
|
2009-07-08 13:19:16 -07:00
|
|
|
error = PTR_ERR(acts);
|
|
|
|
if (IS_ERR(acts))
|
|
|
|
goto error_free_flow;
|
|
|
|
rcu_assign_pointer(flow->sf_acts, acts);
|
|
|
|
|
|
|
|
/* Put flow in bucket. */
|
2010-04-02 16:46:18 -04:00
|
|
|
error = tbl_insert(table, &flow->tbl_node, flow_hash(&flow->key));
|
2009-09-01 10:31:32 -07:00
|
|
|
if (error)
|
|
|
|
goto error_free_flow_acts;
|
2010-04-02 16:46:18 -04:00
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
memset(stats, 0, sizeof(struct odp_flow_stats));
|
2009-07-08 13:19:16 -07:00
|
|
|
} else {
|
|
|
|
/* We found a matching flow. */
|
|
|
|
struct sw_flow_actions *old_acts, *new_acts;
|
|
|
|
|
2010-04-02 16:46:18 -04:00
|
|
|
flow = flow_cast(flow_node);
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
/* Bail out if we're not allowed to modify an existing flow. */
|
|
|
|
error = -EEXIST;
|
2010-05-13 13:21:33 -07:00
|
|
|
if (!(uf->flags & ODPPF_MODIFY))
|
2009-07-08 13:19:16 -07:00
|
|
|
goto error;
|
|
|
|
|
|
|
|
/* Swap actions. */
|
2010-05-13 13:21:33 -07:00
|
|
|
new_acts = get_actions(&uf->flow);
|
2009-07-08 13:19:16 -07:00
|
|
|
error = PTR_ERR(new_acts);
|
|
|
|
if (IS_ERR(new_acts))
|
|
|
|
goto error;
|
|
|
|
old_acts = rcu_dereference(flow->sf_acts);
|
2010-12-10 10:40:58 -08:00
|
|
|
if (old_acts->actions_len != new_acts->actions_len ||
|
2009-07-08 13:19:16 -07:00
|
|
|
memcmp(old_acts->actions, new_acts->actions,
|
2010-12-10 10:40:58 -08:00
|
|
|
old_acts->actions_len)) {
|
2009-07-08 13:19:16 -07:00
|
|
|
rcu_assign_pointer(flow->sf_acts, new_acts);
|
|
|
|
flow_deferred_free_acts(old_acts);
|
|
|
|
} else {
|
|
|
|
kfree(new_acts);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Fetch stats, then clear them if necessary. */
|
2010-05-12 11:26:55 -07:00
|
|
|
spin_lock_bh(&flow->lock);
|
2010-10-08 16:26:15 -07:00
|
|
|
get_stats(flow, stats);
|
2010-05-13 13:21:33 -07:00
|
|
|
if (uf->flags & ODPPF_ZERO_STATS)
|
2009-07-08 13:19:16 -07:00
|
|
|
clear_stats(flow);
|
2010-05-12 11:26:55 -07:00
|
|
|
spin_unlock_bh(&flow->lock);
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
2009-09-01 10:31:32 -07:00
|
|
|
error_free_flow_acts:
|
|
|
|
kfree(flow->sf_acts);
|
2009-07-08 13:19:16 -07:00
|
|
|
error_free_flow:
|
2010-08-29 09:49:51 -07:00
|
|
|
flow->sf_acts = NULL;
|
|
|
|
flow_put(flow);
|
2009-07-08 13:19:16 -07:00
|
|
|
error:
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
static int put_flow(struct datapath *dp, struct odp_flow_put __user *ufp)
|
|
|
|
{
|
|
|
|
struct odp_flow_stats stats;
|
|
|
|
struct odp_flow_put uf;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (copy_from_user(&uf, ufp, sizeof(struct odp_flow_put)))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
error = do_put_flow(dp, &uf, &stats);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
2010-05-13 13:18:22 -07:00
|
|
|
if (copy_to_user(&ufp->flow.stats, &stats,
|
|
|
|
sizeof(struct odp_flow_stats)))
|
2010-05-13 13:21:33 -07:00
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int do_answer_query(struct sw_flow *flow, u32 query_flags,
|
|
|
|
struct odp_flow_stats __user *ustats,
|
2010-12-10 10:40:58 -08:00
|
|
|
struct nlattr __user *actions,
|
|
|
|
u32 __user *actions_lenp)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
|
|
|
struct sw_flow_actions *sf_acts;
|
2010-05-13 13:21:33 -07:00
|
|
|
struct odp_flow_stats stats;
|
2010-12-10 10:40:58 -08:00
|
|
|
u32 actions_len;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-05-12 11:26:55 -07:00
|
|
|
spin_lock_bh(&flow->lock);
|
2010-10-08 16:26:15 -07:00
|
|
|
get_stats(flow, &stats);
|
2010-05-12 11:26:55 -07:00
|
|
|
if (query_flags & ODPFF_ZERO_TCP_FLAGS)
|
2010-05-13 13:21:33 -07:00
|
|
|
flow->tcp_flags = 0;
|
2010-05-12 11:26:55 -07:00
|
|
|
|
|
|
|
spin_unlock_bh(&flow->lock);
|
2010-05-13 13:21:33 -07:00
|
|
|
|
2010-05-13 13:18:22 -07:00
|
|
|
if (copy_to_user(ustats, &stats, sizeof(struct odp_flow_stats)) ||
|
2010-12-10 10:40:58 -08:00
|
|
|
get_user(actions_len, actions_lenp))
|
2009-07-08 13:19:16 -07:00
|
|
|
return -EFAULT;
|
|
|
|
|
2010-12-10 10:40:58 -08:00
|
|
|
if (!actions_len)
|
2009-07-08 13:19:16 -07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
sf_acts = rcu_dereference(flow->sf_acts);
|
2010-12-10 10:40:58 -08:00
|
|
|
if (put_user(sf_acts->actions_len, actions_lenp) ||
|
2009-07-08 13:19:16 -07:00
|
|
|
(actions && copy_to_user(actions, sf_acts->actions,
|
2010-12-10 10:40:58 -08:00
|
|
|
min(sf_acts->actions_len, actions_len))))
|
2009-07-08 13:19:16 -07:00
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-10-28 14:36:52 -07:00
|
|
|
static int answer_query(struct sw_flow *flow, u32 query_flags,
|
|
|
|
struct odp_flow __user *ufp)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
2010-12-10 10:40:58 -08:00
|
|
|
struct nlattr *actions;
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-05-13 13:18:22 -07:00
|
|
|
if (get_user(actions, &ufp->actions))
|
2009-07-08 13:19:16 -07:00
|
|
|
return -EFAULT;
|
2010-05-13 13:21:33 -07:00
|
|
|
|
2010-10-08 16:26:15 -07:00
|
|
|
return do_answer_query(flow, query_flags,
|
2010-12-10 10:40:58 -08:00
|
|
|
&ufp->stats, actions, &ufp->actions_len);
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
static struct sw_flow *do_del_flow(struct datapath *dp, struct odp_flow_key *key)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
2010-04-02 16:46:18 -04:00
|
|
|
struct tbl *table = rcu_dereference(dp->table);
|
|
|
|
struct tbl_node *flow_node;
|
2009-07-08 13:19:16 -07:00
|
|
|
int error;
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
flow_node = tbl_lookup(table, key, flow_hash(key), flow_cmp);
|
2010-04-02 16:46:18 -04:00
|
|
|
if (!flow_node)
|
2010-05-13 13:21:33 -07:00
|
|
|
return ERR_PTR(-ENOENT);
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-04-02 16:46:18 -04:00
|
|
|
error = tbl_remove(table, flow_node);
|
2009-06-17 12:41:30 -07:00
|
|
|
if (error)
|
2010-05-13 13:21:33 -07:00
|
|
|
return ERR_PTR(error);
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
/* XXX Returned flow_node's statistics might lose a few packets, since
|
|
|
|
* other CPUs can be using this flow. We used to synchronize_rcu() to
|
|
|
|
* make sure that we get completely accurate stats, but that blows our
|
|
|
|
* performance, badly. */
|
|
|
|
return flow_cast(flow_node);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int del_flow(struct datapath *dp, struct odp_flow __user *ufp)
|
|
|
|
{
|
|
|
|
struct sw_flow *flow;
|
|
|
|
struct odp_flow uf;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (copy_from_user(&uf, ufp, sizeof uf))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
flow = do_del_flow(dp, &uf.key);
|
|
|
|
if (IS_ERR(flow))
|
|
|
|
return PTR_ERR(flow);
|
2010-04-02 16:46:18 -04:00
|
|
|
|
2010-10-08 16:26:15 -07:00
|
|
|
error = answer_query(flow, 0, ufp);
|
2009-06-17 12:41:30 -07:00
|
|
|
flow_deferred_free(flow);
|
2009-07-08 13:19:16 -07:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
static int do_query_flows(struct datapath *dp, const struct odp_flowvec *flowvec)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
2010-04-02 16:46:18 -04:00
|
|
|
struct tbl *table = rcu_dereference(dp->table);
|
2010-05-10 13:53:26 -07:00
|
|
|
u32 i;
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
for (i = 0; i < flowvec->n_flows; i++) {
|
2010-05-13 13:21:33 -07:00
|
|
|
struct odp_flow __user *ufp = &flowvec->flows[i];
|
2009-07-08 13:19:16 -07:00
|
|
|
struct odp_flow uf;
|
2010-04-02 16:46:18 -04:00
|
|
|
struct tbl_node *flow_node;
|
2009-07-08 13:19:16 -07:00
|
|
|
int error;
|
|
|
|
|
2010-05-13 13:18:22 -07:00
|
|
|
if (copy_from_user(&uf, ufp, sizeof uf))
|
2009-07-08 13:19:16 -07:00
|
|
|
return -EFAULT;
|
|
|
|
|
2010-04-02 16:46:18 -04:00
|
|
|
flow_node = tbl_lookup(table, &uf.key, flow_hash(&uf.key), flow_cmp);
|
|
|
|
if (!flow_node)
|
2010-05-13 13:18:22 -07:00
|
|
|
error = put_user(ENOENT, &ufp->stats.error);
|
2009-07-08 13:19:16 -07:00
|
|
|
else
|
2010-10-08 16:26:15 -07:00
|
|
|
error = answer_query(flow_cast(flow_node), uf.flags, ufp);
|
2009-07-08 13:19:16 -07:00
|
|
|
if (error)
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
return flowvec->n_flows;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct list_flows_cbdata {
|
|
|
|
struct odp_flow __user *uflows;
|
2010-05-10 13:53:26 -07:00
|
|
|
u32 n_flows;
|
|
|
|
u32 listed_flows;
|
2009-07-08 13:19:16 -07:00
|
|
|
};
|
|
|
|
|
2010-04-02 16:46:18 -04:00
|
|
|
static int list_flow(struct tbl_node *node, void *cbdata_)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
2010-04-02 16:46:18 -04:00
|
|
|
struct sw_flow *flow = flow_cast(node);
|
2009-07-08 13:19:16 -07:00
|
|
|
struct list_flows_cbdata *cbdata = cbdata_;
|
|
|
|
struct odp_flow __user *ufp = &cbdata->uflows[cbdata->listed_flows++];
|
|
|
|
int error;
|
|
|
|
|
2010-05-13 13:18:22 -07:00
|
|
|
if (copy_to_user(&ufp->key, &flow->key, sizeof flow->key))
|
2009-07-08 13:19:16 -07:00
|
|
|
return -EFAULT;
|
2010-10-08 16:26:15 -07:00
|
|
|
error = answer_query(flow, 0, ufp);
|
2009-07-08 13:19:16 -07:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
if (cbdata->listed_flows >= cbdata->n_flows)
|
|
|
|
return cbdata->listed_flows;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
static int do_list_flows(struct datapath *dp, const struct odp_flowvec *flowvec)
|
2009-07-08 13:19:16 -07:00
|
|
|
{
|
|
|
|
struct list_flows_cbdata cbdata;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (!flowvec->n_flows)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
cbdata.uflows = flowvec->flows;
|
|
|
|
cbdata.n_flows = flowvec->n_flows;
|
|
|
|
cbdata.listed_flows = 0;
|
2010-07-15 19:22:07 -07:00
|
|
|
|
2010-04-02 16:46:18 -04:00
|
|
|
error = tbl_foreach(rcu_dereference(dp->table), list_flow, &cbdata);
|
2009-07-08 13:19:16 -07:00
|
|
|
return error ? error : cbdata.listed_flows;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int do_flowvec_ioctl(struct datapath *dp, unsigned long argp,
|
|
|
|
int (*function)(struct datapath *,
|
|
|
|
const struct odp_flowvec *))
|
|
|
|
{
|
|
|
|
struct odp_flowvec __user *uflowvec;
|
|
|
|
struct odp_flowvec flowvec;
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
uflowvec = (struct odp_flowvec __user *)argp;
|
2010-05-13 13:18:22 -07:00
|
|
|
if (copy_from_user(&flowvec, uflowvec, sizeof flowvec))
|
2009-07-08 13:19:16 -07:00
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
if (flowvec.n_flows > INT_MAX / sizeof(struct odp_flow))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
retval = function(dp, &flowvec);
|
|
|
|
return (retval < 0 ? retval
|
|
|
|
: retval == flowvec.n_flows ? 0
|
2010-05-13 13:18:22 -07:00
|
|
|
: put_user(retval, &uflowvec->n_flows));
|
2009-07-08 13:19:16 -07:00
|
|
|
}
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
/* Executes a caller-supplied packet with a caller-supplied action list on
 * datapath 'dp', as if the packet had arrived and matched a flow with those
 * actions.  'execute' points at kernel memory, but the 'actions' and 'data'
 * pointers inside it are userspace pointers.  Returns 0 or a negative errno. */
static int do_execute(struct datapath *dp, const struct odp_execute *execute)
{
        struct odp_flow_key key;
        struct sk_buff *skb;
        struct sw_flow_actions *actions;
        struct ethhdr *eth;
        bool is_frag;
        int err;

        /* Packet must hold at least an Ethernet header and fit in 16 bits. */
        err = -EINVAL;
        if (execute->length < ETH_HLEN || execute->length > 65535)
                goto error;

        actions = flow_actions_alloc(execute->actions_len);
        if (IS_ERR(actions)) {
                err = PTR_ERR(actions);
                goto error;
        }

        err = -EFAULT;
        if (copy_from_user(actions->actions, execute->actions, execute->actions_len))
                goto error_free_actions;

        /* Validate only after the copy so we check what we will execute. */
        err = validate_actions(actions->actions, execute->actions_len);
        if (err)
                goto error_free_actions;

        err = -ENOMEM;
        skb = alloc_skb(execute->length, GFP_KERNEL);
        if (!skb)
                goto error_free_actions;

        err = -EFAULT;
        if (copy_from_user(skb_put(skb, execute->length), execute->data,
                           execute->length))
                goto error_free_skb;

        skb_reset_mac_header(skb);
        eth = eth_hdr(skb);

        /* Normally, setting the skb 'protocol' field would be handled by a
         * call to eth_type_trans(), but it assumes there's a sending
         * device, which we may not have. */
        /* 1536 (0x600) separates EtherType values from 802.3 length fields. */
        if (ntohs(eth->h_proto) >= 1536)
                skb->protocol = eth->h_proto;
        else
                skb->protocol = htons(ETH_P_802_2);

        /* -1: packet did not arrive on any real port. */
        err = flow_extract(skb, -1, &key, &is_frag);
        if (err)
                goto error_free_skb;

        /* execute_actions() consumes the skb, so it is not freed here on
         * success; RCU read lock protects the port/table structures it
         * dereferences. */
        rcu_read_lock();
        err = execute_actions(dp, skb, &key, actions->actions, actions->actions_len);
        rcu_read_unlock();

        kfree(actions);
        return err;

error_free_skb:
        kfree_skb(skb);
error_free_actions:
        kfree(actions);
error:
        return err;
}
|
|
|
|
|
2010-05-13 13:21:33 -07:00
|
|
|
/* ODP_EXECUTE ioctl handler: copies the struct odp_execute argument in
 * from userspace and hands it to do_execute(). */
static int execute_packet(struct datapath *dp, const struct odp_execute __user *executep)
{
        struct odp_execute req;

        if (copy_from_user(&req, executep, sizeof req))
                return -EFAULT;
        return do_execute(dp, &req);
}
|
|
|
|
|
2009-06-17 11:49:01 -07:00
|
|
|
/* Fills in a struct odp_stats for datapath 'dp' and copies it to userspace
 * at 'statsp'.  Returns 0 on success, -EFAULT on a bad user pointer. */
static int get_dp_stats(struct datapath *dp, struct odp_stats __user *statsp)
{
        struct tbl *table = rcu_dereference(dp->table);
        struct odp_stats stats;
        int i;

        stats.n_flows = tbl_count(table);
        stats.cur_capacity = tbl_n_buckets(table);
        stats.max_capacity = TBL_MAX_BUCKETS;
        stats.n_ports = dp->n_ports;
        stats.max_ports = DP_MAX_PORTS;
        stats.n_frags = stats.n_hit = stats.n_missed = stats.n_lost = 0;
        /* Sum the per-CPU counters.  Each CPU's counters are protected by a
         * seqcount; retry the snapshot until it is read without a concurrent
         * writer, so the four fields are mutually consistent. */
        for_each_possible_cpu(i) {
                const struct dp_stats_percpu *percpu_stats;
                struct dp_stats_percpu local_stats;
                unsigned seqcount;

                percpu_stats = per_cpu_ptr(dp->stats_percpu, i);

                do {
                        seqcount = read_seqcount_begin(&percpu_stats->seqlock);
                        local_stats = *percpu_stats;
                } while (read_seqcount_retry(&percpu_stats->seqlock, seqcount));

                stats.n_frags += local_stats.n_frags;
                stats.n_hit += local_stats.n_hit;
                stats.n_missed += local_stats.n_missed;
                stats.n_lost += local_stats.n_lost;
        }
        stats.max_miss_queue = DP_MAX_QUEUE_LEN;
        stats.max_action_queue = DP_MAX_QUEUE_LEN;
        return copy_to_user(statsp, &stats, sizeof stats) ? -EFAULT : 0;
}
|
|
|
|
|
2009-08-01 00:09:56 -07:00
|
|
|
/* MTU of the dp pseudo-device: ETH_DATA_LEN or the minimum of the ports */
|
|
|
|
int dp_min_mtu(const struct datapath *dp)
|
|
|
|
{
|
2010-12-03 13:09:26 -08:00
|
|
|
struct vport *p;
|
2009-08-01 00:09:56 -07:00
|
|
|
int mtu = 0;
|
|
|
|
|
|
|
|
ASSERT_RTNL();
|
|
|
|
|
|
|
|
list_for_each_entry_rcu (p, &dp->port_list, node) {
|
2010-04-12 15:53:39 -04:00
|
|
|
int dev_mtu;
|
2009-08-01 00:09:56 -07:00
|
|
|
|
|
|
|
/* Skip any internal ports, since that's what we're trying to
|
|
|
|
* set. */
|
2010-12-03 13:09:26 -08:00
|
|
|
if (is_internal_vport(p))
|
2009-08-01 00:09:56 -07:00
|
|
|
continue;
|
|
|
|
|
2010-12-03 13:09:26 -08:00
|
|
|
dev_mtu = vport_get_mtu(p);
|
2010-04-12 15:53:39 -04:00
|
|
|
if (!mtu || dev_mtu < mtu)
|
|
|
|
mtu = dev_mtu;
|
2009-08-01 00:09:56 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return mtu ? mtu : ETH_DATA_LEN;
|
|
|
|
}
|
|
|
|
|
2010-04-12 15:53:39 -04:00
|
|
|
/* Sets the MTU of all datapath devices to the minimum of the ports. Must
|
2010-04-26 18:08:54 -07:00
|
|
|
* be called with RTNL lock. */
|
2010-04-12 15:53:39 -04:00
|
|
|
void set_internal_devs_mtu(const struct datapath *dp)
|
2010-02-01 16:43:44 -05:00
|
|
|
{
|
2010-12-03 13:09:26 -08:00
|
|
|
struct vport *p;
|
2010-02-01 16:43:44 -05:00
|
|
|
int mtu;
|
|
|
|
|
|
|
|
ASSERT_RTNL();
|
|
|
|
|
|
|
|
mtu = dp_min_mtu(dp);
|
|
|
|
|
|
|
|
list_for_each_entry_rcu (p, &dp->port_list, node) {
|
2010-12-03 13:09:26 -08:00
|
|
|
if (is_internal_vport(p))
|
|
|
|
vport_set_mtu(p, mtu);
|
2010-02-01 16:43:44 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-12-03 13:09:26 -08:00
|
|
|
/* Fills in a struct odp_port describing vport 'p' and copies it to the
 * userspace buffer 'uop'.  Returns 0 on success, -EFAULT on bad pointer. */
static int put_port(const struct vport *p, struct odp_port __user *uop)
{
        struct odp_port op;

        /* Zero first so padding and any unwritten tail bytes do not leak
         * kernel stack contents to userspace. */
        memset(&op, 0, sizeof op);

        /* vport_get_name()/vport_get_type() return RCU-protected strings. */
        rcu_read_lock();
        /* NOTE(review): strncpy leaves the field unterminated if the name
         * fills it exactly; presumably consumers treat these as fixed-width
         * fields -- confirm against userspace. */
        strncpy(op.devname, vport_get_name(p), sizeof op.devname);
        strncpy(op.type, vport_get_type(p), sizeof op.type);
        rcu_read_unlock();

        op.port = p->port_no;

        return copy_to_user(uop, &op, sizeof op) ? -EFAULT : 0;
}
|
|
|
|
|
2010-07-14 19:27:18 -07:00
|
|
|
/* ODP_VPORT_QUERY handler: looks up a port by device name (if devname is
 * non-empty) or by port number, then writes its description back to 'uport'.
 * Returns 0, -EFAULT, -ENODEV (no such device), -ENOENT (not in this
 * datapath / no such port), or -EINVAL (port number out of range). */
static int query_port(struct datapath *dp, struct odp_port __user *uport)
{
        struct odp_port port;

        if (copy_from_user(&port, uport, sizeof port))
                return -EFAULT;

        if (port.devname[0]) {
                /* Lookup by name. */
                struct vport *vport;
                int err = 0;

                /* Force NUL termination of the user-supplied name. */
                port.devname[IFNAMSIZ - 1] = '\0';

                /* vport_lock serializes against attach/detach; RCU protects
                 * the vport hash used by vport_locate(). */
                vport_lock();
                rcu_read_lock();

                vport = vport_locate(port.devname);
                if (!vport) {
                        err = -ENODEV;
                        goto error_unlock;
                }
                /* The name exists but belongs to a different datapath. */
                if (vport->dp != dp) {
                        err = -ENOENT;
                        goto error_unlock;
                }

                port.port = vport->port_no;

error_unlock:
                rcu_read_unlock();
                vport_unlock();

                if (err)
                        return err;
        } else {
                /* Lookup by port number. */
                if (port.port >= DP_MAX_PORTS)
                        return -EINVAL;
                if (!dp->ports[port.port])
                        return -ENOENT;
        }

        return put_port(dp->ports[port.port], uport);
}
|
|
|
|
|
2010-07-14 19:27:18 -07:00
|
|
|
/* Writes descriptions of up to 'n_ports' of 'dp''s ports into the userspace
 * array 'uports'.  Returns the number of entries written (<= n_ports), or
 * -EFAULT on a bad user pointer.
 *
 * Bug fix: the previous version checked 'idx++ >= n_ports' only *after*
 * calling put_port(p, &uports[idx]), so when the datapath had more ports
 * than 'n_ports' it wrote one struct odp_port past the end of the user
 * buffer and returned n_ports + 1.  The bound is now checked before each
 * write. */
static int do_list_ports(struct datapath *dp, struct odp_port __user *uports,
                         int n_ports)
{
        struct vport *p;
        int idx = 0;

        list_for_each_entry_rcu (p, &dp->port_list, node) {
                if (idx >= n_ports)
                        break;
                if (put_port(p, &uports[idx]))
                        return -EFAULT;
                idx++;
        }
        return idx;
}
|
|
|
|
|
2010-07-14 19:27:18 -07:00
|
|
|
/* ODP_VPORT_LIST handler: copies in the struct odp_portvec request, dumps
 * the ports, and writes back the number actually listed. */
static int list_ports(struct datapath *dp, struct odp_portvec __user *upv)
{
        struct odp_portvec req;
        int n_listed;

        if (copy_from_user(&req, upv, sizeof req))
                return -EFAULT;

        n_listed = do_list_ports(dp, req.ports, req.n_ports);
        if (n_listed < 0)
                return n_listed;

        return put_user(n_listed, &upv->n_ports);
}
|
|
|
|
|
2009-10-12 10:34:10 -07:00
|
|
|
static int get_listen_mask(const struct file *f)
|
|
|
|
{
|
|
|
|
return (long)f->private_data;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void set_listen_mask(struct file *f, int listen_mask)
|
|
|
|
{
|
|
|
|
f->private_data = (void*)(long)listen_mask;
|
|
|
|
}
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
/* Main ioctl dispatcher for the openvswitch character device.  The minor
 * number of the opened device selects the datapath.  Commands fall in two
 * groups: those handled before taking the datapath mutex (they do their own
 * locking, e.g. create/destroy and vport operations), and those handled
 * with dp->mutex held via get_dp_locked(). */
static long openvswitch_ioctl(struct file *f, unsigned int cmd,
                           unsigned long argp)
{
        int dp_idx = iminor(f->f_dentry->d_inode);
        struct datapath *dp;
        int drop_frags, listeners, port_no;
        unsigned int sflow_probability;
        int err;

        /* Handle commands with special locking requirements up front. */
        switch (cmd) {
        case ODP_DP_CREATE:
                err = create_dp(dp_idx, (char __user *)argp);
                goto exit;

        case ODP_DP_DESTROY:
                err = destroy_dp(dp_idx);
                goto exit;

        case ODP_VPORT_ATTACH:
                err = attach_port(dp_idx, (struct odp_port __user *)argp);
                goto exit;

        case ODP_VPORT_DETACH:
                err = get_user(port_no, (int __user *)argp);
                if (!err)
                        err = detach_port(dp_idx, port_no);
                goto exit;

        case ODP_VPORT_MOD:
                err = vport_user_mod((struct odp_port __user *)argp);
                goto exit;

        case ODP_VPORT_STATS_GET:
                err = vport_user_stats_get((struct odp_vport_stats_req __user *)argp);
                goto exit;

        case ODP_VPORT_STATS_SET:
                err = vport_user_stats_set((struct odp_vport_stats_req __user *)argp);
                goto exit;

        case ODP_VPORT_ETHER_GET:
                err = vport_user_ether_get((struct odp_vport_ether __user *)argp);
                goto exit;

        case ODP_VPORT_ETHER_SET:
                err = vport_user_ether_set((struct odp_vport_ether __user *)argp);
                goto exit;

        case ODP_VPORT_MTU_GET:
                err = vport_user_mtu_get((struct odp_vport_mtu __user *)argp);
                goto exit;

        case ODP_VPORT_MTU_SET:
                err = vport_user_mtu_set((struct odp_vport_mtu __user *)argp);
                goto exit;
        }

        /* Remaining commands run with dp->mutex held. */
        dp = get_dp_locked(dp_idx);
        err = -ENODEV;
        if (!dp)
                goto exit;

        switch (cmd) {
        case ODP_DP_STATS:
                err = get_dp_stats(dp, (struct odp_stats __user *)argp);
                break;

        case ODP_GET_DROP_FRAGS:
                err = put_user(dp->drop_frags, (int __user *)argp);
                break;

        case ODP_SET_DROP_FRAGS:
                err = get_user(drop_frags, (int __user *)argp);
                if (err)
                        break;
                err = -EINVAL;
                /* Only a strict boolean is accepted. */
                if (drop_frags != 0 && drop_frags != 1)
                        break;
                dp->drop_frags = drop_frags;
                err = 0;
                break;

        case ODP_GET_LISTEN_MASK:
                err = put_user(get_listen_mask(f), (int __user *)argp);
                break;

        case ODP_SET_LISTEN_MASK:
                err = get_user(listeners, (int __user *)argp);
                if (err)
                        break;
                err = -EINVAL;
                /* Reject any bits outside the defined listen classes. */
                if (listeners & ~ODPL_ALL)
                        break;
                err = 0;
                set_listen_mask(f, listeners);
                break;

        case ODP_GET_SFLOW_PROBABILITY:
                err = put_user(dp->sflow_probability, (unsigned int __user *)argp);
                break;

        case ODP_SET_SFLOW_PROBABILITY:
                err = get_user(sflow_probability, (unsigned int __user *)argp);
                if (!err)
                        dp->sflow_probability = sflow_probability;
                break;

        case ODP_VPORT_QUERY:
                err = query_port(dp, (struct odp_port __user *)argp);
                break;

        case ODP_VPORT_LIST:
                err = list_ports(dp, (struct odp_portvec __user *)argp);
                break;

        case ODP_FLOW_FLUSH:
                err = flush_flows(dp);
                break;

        case ODP_FLOW_PUT:
                err = put_flow(dp, (struct odp_flow_put __user *)argp);
                break;

        case ODP_FLOW_DEL:
                err = del_flow(dp, (struct odp_flow __user *)argp);
                break;

        case ODP_FLOW_GET:
                err = do_flowvec_ioctl(dp, argp, do_query_flows);
                break;

        case ODP_FLOW_LIST:
                err = do_flowvec_ioctl(dp, argp, do_list_flows);
                break;

        case ODP_EXECUTE:
                err = execute_packet(dp, (struct odp_execute __user *)argp);
                break;

        default:
                err = -ENOIOCTLCMD;
                break;
        }
        mutex_unlock(&dp->mutex);
exit:
        return err;
}
|
|
|
|
|
|
|
|
static int dp_has_packet_of_interest(struct datapath *dp, int listeners)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < DP_N_QUEUES; i++) {
|
|
|
|
if (listeners & (1 << i) && !skb_queue_empty(&dp->queues[i]))
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-05-13 15:25:27 -07:00
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
/* 32-bit compat version of list_ports(): the embedded pointer must be
 * widened with compat_ptr() before use. */
static int compat_list_ports(struct datapath *dp, struct compat_odp_portvec __user *upv)
{
        struct compat_odp_portvec req;
        int n_listed;

        if (copy_from_user(&req, upv, sizeof req))
                return -EFAULT;

        n_listed = do_list_ports(dp, compat_ptr(req.ports), req.n_ports);
        if (n_listed < 0)
                return n_listed;

        return put_user(n_listed, &upv->n_ports);
}
|
|
|
|
|
|
|
|
/* Translates a 32-bit struct compat_odp_flow at userspace address 'compat'
 * into the native struct odp_flow '*flow'.  A single access_ok() check up
 * front licenses the cheaper __copy_from_user()/__get_user() calls that
 * follow.  Returns 0 on success, -EFAULT on any faulting access. */
static int compat_get_flow(struct odp_flow *flow, const struct compat_odp_flow __user *compat)
{
        compat_uptr_t actions;

        if (!access_ok(VERIFY_READ, compat, sizeof(struct compat_odp_flow)) ||
            __copy_from_user(&flow->stats, &compat->stats, sizeof(struct odp_flow_stats)) ||
            __copy_from_user(&flow->key, &compat->key, sizeof(struct odp_flow_key)) ||
            __get_user(actions, &compat->actions) ||
            __get_user(flow->actions_len, &compat->actions_len) ||
            __get_user(flow->flags, &compat->flags))
                return -EFAULT;

        /* Widen the 32-bit user pointer to a native user pointer. */
        flow->actions = compat_ptr(actions);
        return 0;
}
|
|
|
|
|
|
|
|
/* 32-bit compat version of the ODP_FLOW_PUT handler: translates the compat
 * flow, installs it, and writes the pre-existing flow's stats back. */
static int compat_put_flow(struct datapath *dp, struct compat_odp_flow_put __user *ufp)
{
        struct odp_flow_put request;
        struct odp_flow_stats old_stats;
        int err;

        if (compat_get_flow(&request.flow, &ufp->flow) ||
            get_user(request.flags, &ufp->flags))
                return -EFAULT;

        err = do_put_flow(dp, &request, &old_stats);
        if (err)
                return err;

        /* Report back the statistics of any flow that was replaced. */
        return copy_to_user(&ufp->flow.stats, &old_stats,
                            sizeof(struct odp_flow_stats)) ? -EFAULT : 0;
}
|
|
|
|
|
|
|
|
/* 32-bit compat version of answer_query(): reads the compat actions pointer
 * out of 'ufp', widens it, and delegates to the common do_answer_query().
 * Returns 0 or a negative errno. */
static int compat_answer_query(struct sw_flow *flow, u32 query_flags,
                               struct compat_odp_flow __user *ufp)
{
        compat_uptr_t actions;

        if (get_user(actions, &ufp->actions))
                return -EFAULT;

        return do_answer_query(flow, query_flags, &ufp->stats,
                               compat_ptr(actions), &ufp->actions_len);
}
|
|
|
|
|
|
|
|
/* 32-bit compat version of the ODP_FLOW_DEL handler: removes the flow that
 * matches the key in 'ufp' and reports its final stats/actions back. */
static int compat_del_flow(struct datapath *dp, struct compat_odp_flow __user *ufp)
{
        struct odp_flow request;
        struct sw_flow *victim;
        int err;

        if (compat_get_flow(&request, ufp))
                return -EFAULT;

        victim = do_del_flow(dp, &request.key);
        if (IS_ERR(victim))
                return PTR_ERR(victim);

        /* Answer the query before scheduling the RCU-deferred free. */
        err = compat_answer_query(victim, 0, ufp);
        flow_deferred_free(victim);
        return err;
}
|
|
|
|
|
|
|
|
/* 32-bit compat version of the flow-query loop: looks up each of the
 * 'n_flows' compat flows by key.  A missing flow is reported in-band via
 * the per-entry stats.error field rather than failing the whole call.
 * Returns n_flows on success or -EFAULT on a faulting user access. */
static int compat_query_flows(struct datapath *dp, struct compat_odp_flow *flows, u32 n_flows)
{
        struct tbl *table = rcu_dereference(dp->table);
        u32 i;

        for (i = 0; i < n_flows; i++) {
                struct compat_odp_flow __user *ufp = &flows[i];
                struct odp_flow uf;
                struct tbl_node *flow_node;
                int error;

                if (compat_get_flow(&uf, ufp))
                        return -EFAULT;

                flow_node = tbl_lookup(table, &uf.key, flow_hash(&uf.key), flow_cmp);
                if (!flow_node)
                        /* No match: tell userspace via the entry itself. */
                        error = put_user(ENOENT, &ufp->stats.error);
                else
                        error = compat_answer_query(flow_cast(flow_node), uf.flags, ufp);
                if (error)
                        return -EFAULT;
        }
        return n_flows;
}
|
|
|
|
|
|
|
|
/* Callback state threaded through tbl_foreach() by compat_list_flows(). */
struct compat_list_flows_cbdata {
        struct compat_odp_flow __user *uflows;  /* Userspace output array. */
        u32 n_flows;                            /* Capacity of 'uflows'. */
        u32 listed_flows;                       /* Entries written so far. */
};
|
|
|
|
|
|
|
|
/* tbl_foreach() callback for compat_list_flows(): dumps one flow into the
 * next free slot of the userspace array.  Returning a nonzero value makes
 * tbl_foreach() stop, which is used both for errors (negative) and for
 * "buffer full" (positive count). */
static int compat_list_flow(struct tbl_node *node, void *cbdata_)
{
        struct sw_flow *flow = flow_cast(node);
        struct compat_list_flows_cbdata *cbdata = cbdata_;
        /* Claim the next slot; the write below uses the pre-increment index. */
        struct compat_odp_flow __user *ufp = &cbdata->uflows[cbdata->listed_flows++];
        int error;

        if (copy_to_user(&ufp->key, &flow->key, sizeof flow->key))
                return -EFAULT;
        error = compat_answer_query(flow, 0, ufp);
        if (error)
                return error;

        /* Buffer full: stop iteration by returning the (nonzero) count. */
        if (cbdata->listed_flows >= cbdata->n_flows)
                return cbdata->listed_flows;
        return 0;
}
|
|
|
|
|
|
|
|
/* 32-bit compat version of do_list_flows(): dumps up to 'n_flows' flows
 * into the compat-layout userspace array 'flows'. */
static int compat_list_flows(struct datapath *dp, struct compat_odp_flow *flows, u32 n_flows)
{
        int err;
        struct compat_list_flows_cbdata cbdata = {
                .uflows = flows,
                .n_flows = n_flows,
                .listed_flows = 0,
        };

        if (!n_flows)
                return 0;

        err = tbl_foreach(rcu_dereference(dp->table), compat_list_flow, &cbdata);
        if (err)
                return err;
        return cbdata.listed_flows;
}
|
|
|
|
|
|
|
|
/* 32-bit compat front end for the flow-vector ioctls.  Validates both the
 * vector header and the flow array with access_ok() up front so the
 * per-entry callbacks may use the unchecked __get_user/__put_user family. */
static int compat_flowvec_ioctl(struct datapath *dp, unsigned long argp,
                                int (*function)(struct datapath *,
                                                struct compat_odp_flow *,
                                                u32 n_flows))
{
        struct compat_odp_flowvec __user *uflowvec;
        struct compat_odp_flow __user *flows;
        struct compat_odp_flowvec flowvec;
        int retval;

        uflowvec = compat_ptr(argp);
        if (!access_ok(VERIFY_WRITE, uflowvec, sizeof *uflowvec) ||
            copy_from_user(&flowvec, uflowvec, sizeof flowvec))
                return -EFAULT;

        /* Reject counts whose byte size would overflow an int. */
        if (flowvec.n_flows > INT_MAX / sizeof(struct compat_odp_flow))
                return -EINVAL;

        flows = compat_ptr(flowvec.flows);
        if (!access_ok(VERIFY_WRITE, flows,
                       flowvec.n_flows * sizeof(struct compat_odp_flow)))
                return -EFAULT;

        retval = function(dp, flows, flowvec.n_flows);
        /* Write back the count only when fewer entries were processed. */
        return (retval < 0 ? retval
                : retval == flowvec.n_flows ? 0
                : put_user(retval, &uflowvec->n_flows));
}
|
|
|
|
|
|
|
|
/* 32-bit compat version of the ODP_EXECUTE handler: reads the compat
 * request field by field (one access_ok() covers the __get_user calls),
 * widens the embedded pointers, and delegates to do_execute(). */
static int compat_execute(struct datapath *dp, const struct compat_odp_execute __user *uexecute)
{
        struct odp_execute execute;
        compat_uptr_t actions;
        compat_uptr_t data;

        if (!access_ok(VERIFY_READ, uexecute, sizeof(struct compat_odp_execute)) ||
            __get_user(actions, &uexecute->actions) ||
            __get_user(execute.actions_len, &uexecute->actions_len) ||
            __get_user(data, &uexecute->data) ||
            __get_user(execute.length, &uexecute->length))
                return -EFAULT;

        execute.actions = compat_ptr(actions);
        execute.data = compat_ptr(data);

        return do_execute(dp, &execute);
}
|
|
|
|
|
|
|
|
/* compat_ioctl entry point for 32-bit userspace on a 64-bit kernel.
 * Commands whose argument layout is identical (or merely a pointer that
 * needs widening) are forwarded to openvswitch_ioctl(); the flow/execute
 * commands have distinct 32-bit struct layouts and get compat handlers. */
static long openvswitch_compat_ioctl(struct file *f, unsigned int cmd, unsigned long argp)
{
        int dp_idx = iminor(f->f_dentry->d_inode);
        struct datapath *dp;
        int err;

        switch (cmd) {
        case ODP_DP_DESTROY:
        case ODP_FLOW_FLUSH:
                /* Ioctls that don't need any translation at all. */
                return openvswitch_ioctl(f, cmd, argp);

        case ODP_DP_CREATE:
        case ODP_VPORT_ATTACH:
        case ODP_VPORT_DETACH:
        case ODP_VPORT_MOD:
        case ODP_VPORT_MTU_SET:
        case ODP_VPORT_MTU_GET:
        case ODP_VPORT_ETHER_SET:
        case ODP_VPORT_ETHER_GET:
        case ODP_VPORT_STATS_SET:
        case ODP_VPORT_STATS_GET:
        case ODP_DP_STATS:
        case ODP_GET_DROP_FRAGS:
        case ODP_SET_DROP_FRAGS:
        case ODP_SET_LISTEN_MASK:
        case ODP_GET_LISTEN_MASK:
        case ODP_SET_SFLOW_PROBABILITY:
        case ODP_GET_SFLOW_PROBABILITY:
        case ODP_VPORT_QUERY:
                /* Ioctls that just need their pointer argument extended. */
                return openvswitch_ioctl(f, cmd, (unsigned long)compat_ptr(argp));
        }

        /* Remaining (layout-translating) commands run with dp->mutex held. */
        dp = get_dp_locked(dp_idx);
        err = -ENODEV;
        if (!dp)
                goto exit;

        switch (cmd) {
        case ODP_VPORT_LIST32:
                err = compat_list_ports(dp, compat_ptr(argp));
                break;

        case ODP_FLOW_PUT32:
                err = compat_put_flow(dp, compat_ptr(argp));
                break;

        case ODP_FLOW_DEL32:
                err = compat_del_flow(dp, compat_ptr(argp));
                break;

        case ODP_FLOW_GET32:
                err = compat_flowvec_ioctl(dp, argp, compat_query_flows);
                break;

        case ODP_FLOW_LIST32:
                err = compat_flowvec_ioctl(dp, argp, compat_list_flows);
                break;

        case ODP_EXECUTE32:
                err = compat_execute(dp, compat_ptr(argp));
                break;

        default:
                err = -ENOIOCTLCMD;
                break;
        }
        mutex_unlock(&dp->mutex);
exit:
        return err;
}
|
|
|
|
#endif
|
|
|
|
|
2010-06-17 15:20:16 -07:00
|
|
|
/* Unfortunately this function is not exported so this is a verbatim copy
 * from net/core/datagram.c in 2.6.30. */
/* Copies 'len' bytes starting at 'offset' within 'skb' to userspace 'to',
 * folding an Internet checksum of the copied bytes into '*csump'.  Walks
 * the linear header, then page fragments, then the frag_list, recursing
 * for chained skbs.  Returns 0 on success, -EFAULT on a faulting copy.
 * Kept byte-for-byte equivalent to the upstream kernel implementation. */
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                                      u8 __user *to, int len,
                                      __wsum *csump)
{
        int start = skb_headlen(skb);
        int pos = 0;
        int i, copy = start - offset;

        /* Copy header. */
        if (copy > 0) {
                int err = 0;
                if (copy > len)
                        copy = len;
                *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
                                               *csump, &err);
                if (err)
                        goto fault;
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                to += copy;
                pos = copy;
        }

        /* Copy from page fragments, checksumming each via a kmap. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_shinfo(skb)->frags[i].size;
                if ((copy = end - offset) > 0) {
                        __wsum csum2;
                        int err = 0;
                        u8 *vaddr;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        struct page *page = frag->page;

                        if (copy > len)
                                copy = len;
                        vaddr = kmap(page);
                        csum2 = csum_and_copy_to_user(vaddr +
                                                        frag->page_offset +
                                                        offset - start,
                                                      to, copy, 0, &err);
                        kunmap(page);
                        if (err)
                                goto fault;
                        /* Fold the fragment checksum in at its byte offset. */
                        *csump = csum_block_add(*csump, csum2, pos);
                        if (!(len -= copy))
                                return 0;
                        offset += copy;
                        to += copy;
                        pos += copy;
                }
                start = end;
        }

        /* Recurse over any chained skbs in the frag_list. */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *list = skb_shinfo(skb)->frag_list;

                for (; list; list=list->next) {
                        int end;

                        WARN_ON(start > offset + len);

                        end = start + list->len;
                        if ((copy = end - offset) > 0) {
                                __wsum csum2 = 0;
                                if (copy > len)
                                        copy = len;
                                if (skb_copy_and_csum_datagram(list,
                                                               offset - start,
                                                               to, copy,
                                                               &csum2))
                                        goto fault;
                                *csump = csum_block_add(*csump, csum2, pos);
                                if ((len -= copy) == 0)
                                        return 0;
                                offset += copy;
                                to += copy;
                                pos += copy;
                        }
                        start = end;
                }
        }
        if (!len)
                return 0;

fault:
        return -EFAULT;
}
|
|
|
|
|
2010-12-04 13:52:25 -08:00
|
|
|
/* read() handler for the openvswitch character device: returns the next
 * queued packet from any queue enabled in this file's listen mask, blocking
 * unless O_NONBLOCK is set.  For CHECKSUM_PARTIAL packets read in full, the
 * checksum is completed on the fly while copying to userspace.
 *
 * Bug fix: the "nbytes == 0 || !listeners" early exit previously did a bare
 * 'return 0;' while still holding dp->mutex (taken by get_dp_locked()),
 * permanently wedging the datapath.  It now releases the mutex like every
 * other exit path. */
static ssize_t openvswitch_read(struct file *f, char __user *buf,
                                size_t nbytes, loff_t *ppos)
{
        int listeners = get_listen_mask(f);
        int dp_idx = iminor(f->f_dentry->d_inode);
        struct datapath *dp = get_dp_locked(dp_idx);
        struct sk_buff *skb;
        size_t copy_bytes, tot_copy_bytes;
        int retval;

        if (!dp)
                return -ENODEV;

        if (nbytes == 0 || !listeners) {
                /* Must not return with dp->mutex held. */
                retval = 0;
                goto error;
        }

        for (;;) {
                int i;

                /* Lower-numbered queues have priority. */
                for (i = 0; i < DP_N_QUEUES; i++) {
                        if (listeners & (1 << i)) {
                                skb = skb_dequeue(&dp->queues[i]);
                                if (skb)
                                        goto success;
                        }
                }

                if (f->f_flags & O_NONBLOCK) {
                        retval = -EAGAIN;
                        goto error;
                }

                /* NOTE(review): this sleeps while dp->mutex is held --
                 * presumably enqueuers do not take the mutex; confirm. */
                wait_event_interruptible(dp->waitqueue,
                                         dp_has_packet_of_interest(dp,
                                                                   listeners));

                if (signal_pending(current)) {
                        retval = -ERESTARTSYS;
                        goto error;
                }
        }
success:
        mutex_unlock(&dp->mutex);

        copy_bytes = tot_copy_bytes = min_t(size_t, skb->len, nbytes);

        retval = 0;
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (copy_bytes == skb->len) {
                        /* Whole packet fits: complete the checksum while
                         * copying the checksummed region to userspace, then
                         * patch the folded checksum into the user buffer. */
                        __wsum csum = 0;
                        u16 csum_start, csum_offset;

                        get_skb_csum_pointers(skb, &csum_start, &csum_offset);
                        BUG_ON(csum_start >= skb_headlen(skb));
                        retval = skb_copy_and_csum_datagram(skb, csum_start, buf + csum_start,
                                                            copy_bytes - csum_start, &csum);
                        if (!retval) {
                                __sum16 __user *csump;

                                /* Only the pre-checksum prefix remains. */
                                copy_bytes = csum_start;
                                csump = (__sum16 __user *)(buf + csum_start + csum_offset);

                                BUG_ON((char *)csump + sizeof(__sum16) > buf + nbytes);
                                put_user(csum_fold(csum), csump);
                        }
                } else
                        /* Truncated read: materialize the checksum in the
                         * skb itself before the plain copy below. */
                        retval = skb_checksum_help(skb);
        }

        if (!retval) {
                struct iovec __user iov;

                iov.iov_base = buf;
                iov.iov_len = copy_bytes;
                retval = skb_copy_datagram_iovec(skb, 0, &iov, iov.iov_len);
        }

        if (!retval)
                retval = tot_copy_bytes;

        kfree_skb(skb);
        return retval;

error:
        mutex_unlock(&dp->mutex);
        return retval;
}
|
|
|
|
|
|
|
|
/* poll() handler: readable when a queue enabled by this file's listen mask
 * has a packet; a vanished datapath reports readable + hangup. */
static unsigned int openvswitch_poll(struct file *file, poll_table *wait)
{
        int dp_idx = iminor(file->f_dentry->d_inode);
        struct datapath *dp = get_dp_locked(dp_idx);
        unsigned int mask = 0;

        if (!dp)
                return POLLIN | POLLRDNORM | POLLHUP;

        poll_wait(file, &dp->waitqueue, wait);
        if (dp_has_packet_of_interest(dp, get_listen_mask(file)))
                mask |= POLLIN | POLLRDNORM;
        mutex_unlock(&dp->mutex);

        return mask;
}
|
|
|
|
|
2010-12-04 13:52:25 -08:00
|
|
|
/* File operations for the openvswitch character device; the minor number
 * of the opened node selects the datapath. */
static struct file_operations openvswitch_fops = {
        .read           = openvswitch_read,
        .poll           = openvswitch_poll,
        .unlocked_ioctl = openvswitch_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = openvswitch_compat_ioctl,
#endif
};
|
|
|
|
|
|
|
|
/* Dynamically allocated major device number for the openvswitch chrdev. */
static int major;
|
2009-09-11 15:49:37 -07:00
|
|
|
|
|
|
|
/* Module init: brings up the flow and vport subsystems, registers the
 * netdevice notifier, and registers the openvswitch character device.
 * Returns 0 on success or a negative errno, unwinding in reverse order on
 * failure.
 *
 * Bug fix: the result of register_chrdev() was assigned to 'major' but the
 * error test checked 'err' (still 0 from the previous successful call), so
 * a chrdev registration failure was silently reported as success and left
 * 'major' negative.  Also added the missing KERN_INFO log level. */
static int __init dp_init(void)
{
        struct sk_buff *dummy_skb;
        int err;

        /* Our per-skb control block must fit in the generic skb->cb. */
        BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));

        printk(KERN_INFO "Open vSwitch %s, built "__DATE__" "__TIME__"\n", VERSION BUILDNR);

        err = flow_init();
        if (err)
                goto error;

        err = vport_init();
        if (err)
                goto error_flow_exit;

        err = register_netdevice_notifier(&dp_device_notifier);
        if (err)
                goto error_vport_exit;

        major = register_chrdev(0, "openvswitch", &openvswitch_fops);
        if (major < 0) {
                err = major;
                goto error_unreg_notifier;
        }

        return 0;

error_unreg_notifier:
        unregister_netdevice_notifier(&dp_device_notifier);
error_vport_exit:
        vport_exit();
error_flow_exit:
        flow_exit();
error:
        return err;
}
|
|
|
|
|
|
|
|
/* Module exit: tears down in reverse of dp_init().  rcu_barrier() runs
 * first so all pending RCU callbacks (e.g. deferred flow frees) complete
 * before the subsystems they touch are torn down. */
static void dp_cleanup(void)
{
        rcu_barrier();
        unregister_chrdev(major, "openvswitch");
        unregister_netdevice_notifier(&dp_device_notifier);
        vport_exit();
        flow_exit();
}
|
|
|
|
|
|
|
|
/* Module entry/exit points and metadata. */
module_init(dp_init);
module_exit(dp_cleanup);

MODULE_DESCRIPTION("Open vSwitch switching datapath");
MODULE_LICENSE("GPL");
|