2
0
mirror of https://github.com/openvswitch/ovs synced 2025-09-06 17:26:27 +00:00
Files
ovs/datapath/datapath.c

2187 lines
52 KiB
C
Raw Normal View History

/*
* Copyright (c) 2007-2014 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/version.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "vlan.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
int ovs_net_id __read_mostly;
static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;
static struct genl_multicast_group ovs_dp_flow_multicast_group = {
.name = OVS_FLOW_MCGROUP
};
static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
.name = OVS_DATAPATH_MCGROUP
};
struct genl_multicast_group ovs_dp_vport_multicast_group = {
.name = OVS_VPORT_MCGROUP
};
/* Check if need to build a reply message.
* OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
unsigned int group)
{
return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
genl_has_listeners(family, genl_info_net(info)->genl_sock,
group);
}
static void ovs_notify(struct genl_family *family, struct genl_multicast_group *grp,
struct sk_buff *skb, struct genl_info *info)
{
genl_notify(family, skb, genl_info_net(info),
info->snd_portid, GROUP_ID(grp), info->nlhdr, GFP_KERNEL);
}
/**
* DOC: Locking:
*
* All writes e.g. Writes to device state (add/remove datapath, port, set
* operations on vports, etc.), Writes to other state (flow table
* modifications, set miscellaneous datapath parameters, etc.) are protected
* by ovs_lock.
*
* Reads are protected by RCU.
*
* There are a few special cases (mostly stats) that have their own
* synchronization but they nest under all of above and don't interact with
* each other.
*
* The RTNL lock nests inside ovs_mutex.
*/
static DEFINE_MUTEX(ovs_mutex);
void ovs_lock(void)
{
mutex_lock(&ovs_mutex);
}
void ovs_unlock(void)
{
mutex_unlock(&ovs_mutex);
}
#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
if (debug_locks)
return lockdep_is_held(&ovs_mutex);
else
return 1;
}
#endif
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
struct sw_flow_key *, const struct dp_upcall_info *);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
struct sw_flow_key *key,
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
const struct dp_upcall_info *);
/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
if (dev) {
struct vport *vport = ovs_internal_dev_get_vport(dev);
if (vport)
return vport->dp;
}
return NULL;
}
/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
* returned dp pointer valid. */
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
struct datapath *dp;
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
rcu_read_lock();
dp = get_dp_rcu(net, dp_ifindex);
rcu_read_unlock();
return dp;
}
/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
return vport->ops->get_name(vport);
}
static int get_dpifindex(struct datapath *dp)
{
struct vport *local;
int ifindex;
rcu_read_lock();
local = ovs_vport_rcu(dp, OVSP_LOCAL);
if (local)
ifindex = netdev_vport_priv(local)->dev->ifindex;
else
ifindex = 0;
rcu_read_unlock();
return ifindex;
}
static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
ovs_flow_tbl_destroy(&dp->table);
free_percpu(dp->stats_percpu);
release_net(ovs_dp_get_net(dp));
kfree(dp->ports);
kfree(dp);
}
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
u16 port_no)
{
return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}
/* Called with ovs_mutex or RCU read lock. */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
struct vport *vport;
struct hlist_head *head;
head = vport_hash_bucket(dp, port_no);
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
if (vport->port_no == port_no)
return vport;
}
return NULL;
}
/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
struct vport *vport;
vport = ovs_vport_add(parms);
if (!IS_ERR(vport)) {
struct datapath *dp = parms->dp;
struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
hlist_add_head_rcu(&vport->dp_hash_node, head);
}
return vport;
}
void ovs_dp_detach_port(struct vport *p)
{
ASSERT_OVSL();
/* First drop references to device. */
hlist_del_rcu(&p->dp_hash_node);
/* Then destroy it. */
ovs_vport_del(p);
}
/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
const struct vport *p = OVS_CB(skb)->input_vport;
struct datapath *dp = p->dp;
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct dp_stats_percpu *stats;
u64 *stats_counter;
u32 n_mask_hit;
stats = this_cpu_ptr(dp->stats_percpu);
/* Look up flow. */
flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
&n_mask_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
int error;
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.userdata = NULL;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
upcall.egress_tun_info = NULL;
error = ovs_dp_upcall(dp, skb, key, &upcall);
if (unlikely(error))
kfree_skb(skb);
else
consume_skb(skb);
stats_counter = &stats->n_missed;
goto out;
}
ovs_flow_stats_update(flow, key->tp.flags, skb);
sf_acts = rcu_dereference(flow->sf_acts);
ovs_execute_actions(dp, skb, key, sf_acts);
stats_counter = &stats->n_hit;
datapath: Detect and suppress flows that are implicated in loops. In-kernel loops need to be suppressed; otherwise, they cause high CPU consumption, even to the point that the machine becomes unusable. Ideally these flows should never be added to the Open vSwitch flow table, but it is fairly easy for a buggy controller to create them given the menagerie of tunnels, patches, etc. that OVS makes available. Commit ecbb6953b "datapath: Add loop checking" did the initial work toward suppressing loops, by dropping packets that recursed more than 5 times. This at least prevented the kernel stack from overflowing and thereby OOPSing the machine. But even with this commit, it is still possible to waste a lot of CPU time due to loops. The problem is not limited to 5 recursive calls per packet: any packet can be sent to multiple destinations, which in turn can themselves be sent to multiple destinations, and so on. We have actually seen in practice a case where each packet was, apparently, sent to at least 2 destinations per hop, so that each packet actually consumed CPU time for 2**5 == 32 packets, possibly more. This commit takes loop suppression a step further, by clearing the actions of flows that are implicated in loops. Thus, after the first packet in such a flow, later packets for either the "root" flow or for flows that it ends up looping through are simply discarded, saving a huge amount of CPU time. This version of the commit just clears the actions from the flows that a part of the loop. Probably, there should be some additional action to tell ovs-vswitchd that a loop has been detected, so that it can in turn inform the controller one way or another. My test case was this: ovs-controller -H --max-idle=permanent punix:/tmp/controller ovs-vsctl -- \ set-controller br0 unix:/tmp/controller -- \ add-port br0 patch00 -- \ add-port br0 patch01 -- \ add-port br0 patch10 -- \ add-port br0 patch11 -- \ add-port br0 patch20 -- \ add-port br0 patch21 -- \ add-port br0 patch30 -- \ add-port br0 patch31 -- \ set Interface patch00 type=patch options:peer=patch01 -- \ set Interface patch01 type=patch options:peer=patch00 -- \ set Interface patch10 type=patch options:peer=patch11 -- \ set Interface patch11 type=patch options:peer=patch10 -- \ set Interface patch20 type=patch options:peer=patch21 -- \ set Interface patch21 type=patch options:peer=patch20 -- \ set Interface patch30 type=patch options:peer=patch31 -- \ set Interface patch31 type=patch options:peer=patch30 followed by sending a single "ping" packet from an attached Ethernet port into the bridge. After this, without this commit the vswitch userspace and kernel consume 50-75% of the machine's CPU (in my KVM test setup on a single physical host); with this commit, some CPU is consumed initially but it converges on 0% quickly. A more challenging test sends a series of packets in multiple flows; I used "hping3" with its default options. Without this commit, the vswitch consumes 100% of the machine's CPU, most of which is in the kernel. With this commit, the vswitch consumes "only" 33-50% CPU, most of which is in userspace, so the machine is more responsive. A refinement on this commit would be to pass the loop counter down to userspace as part of the odp_msg struct and then back up as part of the ODP_EXECUTE command arguments. This would, presumably, reduce the CPU requirements, since it would allow loop detection to happen earlier, during initial setup of flows, instead of just on the second and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
out:
datapath: Detect and suppress flows that are implicated in loops. In-kernel loops need to be suppressed; otherwise, they cause high CPU consumption, even to the point that the machine becomes unusable. Ideally these flows should never be added to the Open vSwitch flow table, but it is fairly easy for a buggy controller to create them given the menagerie of tunnels, patches, etc. that OVS makes available. Commit ecbb6953b "datapath: Add loop checking" did the initial work toward suppressing loops, by dropping packets that recursed more than 5 times. This at least prevented the kernel stack from overflowing and thereby OOPSing the machine. But even with this commit, it is still possible to waste a lot of CPU time due to loops. The problem is not limited to 5 recursive calls per packet: any packet can be sent to multiple destinations, which in turn can themselves be sent to multiple destinations, and so on. We have actually seen in practice a case where each packet was, apparently, sent to at least 2 destinations per hop, so that each packet actually consumed CPU time for 2**5 == 32 packets, possibly more. This commit takes loop suppression a step further, by clearing the actions of flows that are implicated in loops. Thus, after the first packet in such a flow, later packets for either the "root" flow or for flows that it ends up looping through are simply discarded, saving a huge amount of CPU time. This version of the commit just clears the actions from the flows that a part of the loop. Probably, there should be some additional action to tell ovs-vswitchd that a loop has been detected, so that it can in turn inform the controller one way or another. My test case was this: ovs-controller -H --max-idle=permanent punix:/tmp/controller ovs-vsctl -- \ set-controller br0 unix:/tmp/controller -- \ add-port br0 patch00 -- \ add-port br0 patch01 -- \ add-port br0 patch10 -- \ add-port br0 patch11 -- \ add-port br0 patch20 -- \ add-port br0 patch21 -- \ add-port br0 patch30 -- \ add-port br0 patch31 -- \ set Interface patch00 type=patch options:peer=patch01 -- \ set Interface patch01 type=patch options:peer=patch00 -- \ set Interface patch10 type=patch options:peer=patch11 -- \ set Interface patch11 type=patch options:peer=patch10 -- \ set Interface patch20 type=patch options:peer=patch21 -- \ set Interface patch21 type=patch options:peer=patch20 -- \ set Interface patch30 type=patch options:peer=patch31 -- \ set Interface patch31 type=patch options:peer=patch30 followed by sending a single "ping" packet from an attached Ethernet port into the bridge. After this, without this commit the vswitch userspace and kernel consume 50-75% of the machine's CPU (in my KVM test setup on a single physical host); with this commit, some CPU is consumed initially but it converges on 0% quickly. A more challenging test sends a series of packets in multiple flows; I used "hping3" with its default options. Without this commit, the vswitch consumes 100% of the machine's CPU, most of which is in the kernel. With this commit, the vswitch consumes "only" 33-50% CPU, most of which is in userspace, so the machine is more responsive. A refinement on this commit would be to pass the loop counter down to userspace as part of the odp_msg struct and then back up as part of the ODP_EXECUTE command arguments. This would, presumably, reduce the CPU requirements, since it would allow loop detection to happen earlier, during initial setup of flows, instead of just on the second and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
/* Update datapath statistics. */
u64_stats_update_begin(&stats->syncp);
(*stats_counter)++;
stats->n_mask_hit += n_mask_hit;
u64_stats_update_end(&stats->syncp);
}
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info)
{
struct dp_stats_percpu *stats;
int err;
if (upcall_info->portid == 0) {
err = -ENOTCONN;
goto err;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
if (!skb_is_gso(skb))
err = queue_userspace_packet(dp, skb, key, upcall_info);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
else
err = queue_gso_packets(dp, skb, key, upcall_info);
if (err)
goto err;
return 0;
err:
stats = this_cpu_ptr(dp->stats_percpu);
u64_stats_update_begin(&stats->syncp);
stats->n_lost++;
u64_stats_update_end(&stats->syncp);
return err;
}
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
const struct dp_upcall_info *upcall_info)
{
unsigned short gso_type = skb_shinfo(skb)->gso_type;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
struct ovs_skb_cb ovs_cb;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
int err;
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
ovs_cb = *OVS_CB(skb);
segs = __skb_gso_segment(skb, NETIF_F_SG, false);
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
*OVS_CB(skb) = ovs_cb;
if (IS_ERR(segs))
return PTR_ERR(segs);
if (gso_type & SKB_GSO_UDP) {
/* The initial flow key extracted by ovs_flow_key_extract()
* in this case is for a first fragment, so we need to
* properly mark later fragments.
*/
later_key = *key;
later_key.ip.frag = OVS_FRAG_TYPE_LATER;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Queue all of the segments. */
skb = segs;
do {
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
*OVS_CB(skb) = ovs_cb;
if (gso_type & SKB_GSO_UDP && skb != segs)
key = &later_key;
err = queue_userspace_packet(dp, skb, key, upcall_info);
if (err)
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
break;
datapath: Report kernel's flow key when passing packets up to userspace. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. This commit takes one step in that direction by making the kernel report its idea of the flow that a packet belongs to whenever it passes a packet up to userspace. This means that userspace can intelligently figure out what to do: - If userspace's notion of the flow for the packet matches the kernel's, then nothing special is necessary. - If the kernel has a more specific notion for the flow than userspace, for example if the kernel decoded IPv6 headers but userspace stopped at the Ethernet type (because it does not understand IPv6), then again nothing special is necessary: userspace can still set up the flow in the usual way. - If userspace has a more specific notion for the flow than the kernel, for example if userspace decoded an IPv6 header but the kernel stopped at the Ethernet type, then userspace can forward the packet manually, without setting up a flow in the kernel. (This case is bad from a performance point of view, but at least it is correct.) This commit does not actually make userspace flexible enough to handle changes in the kernel flow key structure, although userspace does now have enough information to do that intelligently. This will have to wait for later commits. This commit is bigger than it would otherwise be because it is rolled together with changing "struct odp_msg" to a sequence of Netlink attributes. The alternative, to do each of those changes in a separate patch, seemed like overkill because it meant that either we would have to introduce and then kill off Netlink attributes for in_port and tun_id, if Netlink conversion went first, or shove yet another variable-length header into the stuff already after odp_msg, if adding the flow key to odp_msg went first. This commit will slow down performance of checksumming packets sent up to userspace. I'm not entirely pleased with how I did it. I considered a couple of alternatives, but none of them seemed that much better. Suggestions welcome. Not changing anything wasn't an option, unfortunately. At any rate some slowdown will become unavoidable when OVS actually starts using Netlink instead of just Netlink framing. (Actually, I thought of one option where we could avoid that: make userspace do the checksum instead, by passing csum_start and csum_offset as part of what goes to userspace. But that's not perfect either.) Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2011-01-24 14:59:57 -08:00
} while ((skb = skb->next));
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Free all of the segments. */
skb = segs;
do {
nskb = skb->next;
if (err)
kfree_skb(skb);
else
consume_skb(skb);
} while ((skb = nskb));
return err;
}
static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
unsigned int hdrlen)
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */
/* OVS_PACKET_ATTR_USERDATA */
if (upcall_info->userdata)
size += NLA_ALIGN(upcall_info->userdata->nla_len);
/* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
if (upcall_info->egress_tun_info)
size += nla_total_size(ovs_tun_key_attr_size());
return size;
}
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb = NULL; /* to be queued to userspace */
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
struct nlattr *nla;
struct genl_info info = {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)
.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
.snd_portid = upcall_info->portid,
};
size_t len;
unsigned int hlen;
int err, dp_ifindex;
dp_ifindex = get_dpifindex(dp);
if (!dp_ifindex)
return -ENODEV;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
if (vlan_tx_tag_present(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb));
if (!nskb)
return -ENOMEM;
vlan_set_tci(nskb, 0);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
skb = nskb;
}
if (nla_attr_size(skb->len) > USHRT_MAX) {
err = -EFBIG;
goto out;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Complete checksum if needed */
if (skb->ip_summed == CHECKSUM_PARTIAL &&
(err = skb_checksum_help(skb)))
goto out;
/* Older versions of OVS user space enforce alignment of the last
* Netlink attribute to NLA_ALIGNTO which would require extensive
* padding logic. Only perform zerocopy if padding is not required.
*/
if (dp->user_features & OVS_DP_F_UNALIGNED)
hlen = skb_zerocopy_headlen(skb);
else
hlen = skb->len;
len = upcall_msg_size(upcall_info, hlen);
user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
goto out;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
0, upcall_info->cmd);
upcall->dp_ifindex = dp_ifindex;
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
err = ovs_nla_put_flow(key, key, user_skb);
BUG_ON(err);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
nla_nest_end(user_skb, nla);
if (upcall_info->userdata)
__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
if (upcall_info->egress_tun_info) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
err = ovs_nla_put_egress_tunnel_key(user_skb,
upcall_info->egress_tun_info);
BUG_ON(err);
nla_nest_end(user_skb, nla);
}
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy() */
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
err = -ENOBUFS;
goto out;
}
nla->nla_len = nla_attr_size(skb->len);
err = skb_zerocopy(user_skb, skb, skb->len, hlen);
if (err)
goto out;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;
if (plen > 0)
memset(skb_put(user_skb, plen), 0, plen);
}
((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
user_skb = NULL;
out:
if (err)
skb_tx_error(skb);
kfree_skb(user_skb);
kfree_skb(nskb);
return err;
}
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
struct ovs_header *ovs_header = info->userhdr;
struct nlattr **a = info->attrs;
struct sw_flow_actions *acts;
struct sk_buff *packet;
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct datapath *dp;
struct ethhdr *eth;
struct vport *input_vport;
int len;
int err;
err = -EINVAL;
if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
!a[OVS_PACKET_ATTR_ACTIONS])
goto err;
len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
err = -ENOMEM;
if (!packet)
goto err;
skb_reserve(packet, NET_IP_ALIGN);
nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
skb_reset_mac_header(packet);
eth = eth_hdr(packet);
/* Normally, setting the skb 'protocol' field would be handled by a
* call to eth_type_trans(), but it assumes there's a sending
* device, which we may not have. */
if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
if (IS_ERR(flow))
goto err_kfree_skb;
err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet,
&flow->key);
if (err)
goto err_flow_free;
err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
&flow->key, &acts);
if (err)
goto err_flow_free;
rcu_assign_pointer(flow->sf_acts, acts);
datapath: Set packet egress_tun_info. packet execute is setting egress_tun_info in skb->cb, rather than packet->cb. skb is netlink msg skb. This causes corruption in netlink skb state stored in skb->cb (NETLINK_CB) which results in following deadlock in netlink code. ============================================= [ INFO: possible recursive locking detected ] 3.2.62 #2 --------------------------------------------- handler55/22851 is trying to acquire lock: (genl_mutex){+.+.+.}, at: [<ffffffff81471ad7>] genl_lock+0x17/0x20 but task is already holding lock: (genl_mutex){+.+.+.}, at: [<ffffffff81471ad7>] genl_lock+0x17/0x20 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(genl_mutex); lock(genl_mutex); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by handler55/22851: #0: (genl_mutex){+.+.+.}, at: [<ffffffff81471ad7>] genl_lock+0x17/0x20 stack backtrace: Pid: 22851, comm: handler55 Tainted: G O 3.2.62 #2 Call Trace: [<ffffffff81097bb2>] print_deadlock_bug+0xf2/0x100 [<ffffffff81099b99>] validate_chain+0x579/0x860 [<ffffffff8109a17c>] __lock_acquire+0x2fc/0x4f0 [<ffffffff8109aab0>] lock_acquire+0xa0/0x180 [<ffffffff81519070>] __mutex_lock_common+0x60/0x420 [<ffffffff8151959a>] mutex_lock_nested+0x4a/0x60 [<ffffffff81471ad7>] genl_lock+0x17/0x20 [<ffffffff81471af6>] genl_rcv+0x16/0x40 [<ffffffff8146ff72>] netlink_unicast+0x2f2/0x310 [<ffffffff81470159>] netlink_ack+0x109/0x1f0 [<ffffffff8147030b>] netlink_rcv_skb+0xcb/0xd0 [<ffffffff81471b05>] genl_rcv+0x25/0x40 [<ffffffff8146ff72>] netlink_unicast+0x2f2/0x310 [<ffffffff8147134c>] netlink_sendmsg+0x28c/0x3d0 [<ffffffff8143375f>] sock_sendmsg+0xef/0x120 [<ffffffff81435766>] ___sys_sendmsg+0x416/0x430 [<ffffffff81435949>] __sys_sendmsg+0x49/0x90 [<ffffffff814359a9>] sys_sendmsg+0x19/0x20 [<ffffffff8152432b>] system_call_fastpath+0x16/0x1b Reported-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Joe Stringer <joestringer@nicira.com>
2014-09-07 15:18:07 -07:00
OVS_CB(packet)->egress_tun_info = NULL;
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
rcu_read_lock();
dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
err = -ENODEV;
if (!dp)
goto err_unlock;
input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
if (!input_vport)
input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);
if (!input_vport)
goto err_unlock;
OVS_CB(packet)->input_vport = input_vport;
sf_acts = rcu_dereference(flow->sf_acts);
local_bh_disable();
err = ovs_execute_actions(dp, packet, &flow->key, sf_acts);
local_bh_enable();
rcu_read_unlock();
ovs_flow_free(flow, false);
return err;
err_unlock:
rcu_read_unlock();
err_flow_free:
ovs_flow_free(flow, false);
err_kfree_skb:
kfree_skb(packet);
err:
return err;
}
static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
};
static struct genl_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = packet_policy,
.doit = ovs_packet_cmd_execute
}
};
static struct genl_family dp_packet_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_PACKET_FAMILY,
.version = OVS_PACKET_VERSION,
.maxattr = OVS_PACKET_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_packet_genl_ops,
.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
};
static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats,
struct ovs_dp_megaflow_stats *mega_stats)
{
int i;
memset(mega_stats, 0, sizeof(*mega_stats));
stats->n_flows = ovs_flow_tbl_count(&dp->table);
mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
stats->n_hit = stats->n_missed = stats->n_lost = 0;
for_each_possible_cpu(i) {
const struct dp_stats_percpu *percpu_stats;
struct dp_stats_percpu local_stats;
unsigned int start;
percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
do {
start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
local_stats = *percpu_stats;
} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
stats->n_lost += local_stats.n_lost;
mega_stats->n_mask_hit += local_stats.n_mask_hit;
}
}
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
return NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */
+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
+ nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_match(const struct sw_flow *flow,
struct sk_buff *skb)
{
struct nlattr *nla;
int err;
/* Fill flow key. */
nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
if (!nla)
return -EMSGSIZE;
err = ovs_nla_put_flow(&flow->unmasked_key,
&flow->unmasked_key, skb);
if (err)
return err;
nla_nest_end(skb, nla);
/* Fill flow mask. */
nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
if (!nla)
return -EMSGSIZE;
err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
if (err)
return err;
nla_nest_end(skb, nla);
return 0;
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
struct sk_buff *skb)
{
struct ovs_flow_stats stats;
__be16 tcp_flags;
unsigned long used;
ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
if (used &&
nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
return -EMSGSIZE;
if (stats.n_packets &&
nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
return -EMSGSIZE;
if ((u8)ntohs(tcp_flags) &&
nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
return -EMSGSIZE;
return 0;
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
struct sk_buff *skb, int skb_orig_len)
{
struct nlattr *start;
int err;
/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
* this is the first flow to be dumped into 'skb'. This is unusual for
* Netlink but individual action lists can be longer than
* NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
* The userspace caller can always fetch the actions separately if it
* really wants them. (Most userspace callers in fact don't care.)
*
* This can only fail for dump operations because the skb is always
* properly sized for single flows.
*/
start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
if (start) {
const struct sw_flow_actions *sf_acts;
sf_acts = rcu_dereference_ovsl(flow->sf_acts);
err = ovs_nla_put_actions(sf_acts->actions,
sf_acts->actions_len, skb);
if (!err)
nla_nest_end(skb, start);
else {
if (skb_orig_len)
return err;
nla_nest_cancel(skb, start);
}
} else if (skb_orig_len) {
return -EMSGSIZE;
}
return 0;
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
struct sk_buff *skb, u32 portid,
u32 seq, u32 flags, u8 cmd)
{
const int skb_orig_len = skb->len;
struct ovs_header *ovs_header;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd);
if (!ovs_header)
return -EMSGSIZE;
ovs_header->dp_ifindex = dp_ifindex;
err = ovs_flow_cmd_fill_match(flow, skb);
if (err)
goto error;
err = ovs_flow_cmd_fill_stats(flow, skb);
if (err)
goto error;
err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
if (err)
goto error;
return genlmsg_end(skb, ovs_header);
error:
genlmsg_cancel(skb, ovs_header);
return err;
}
/* May not be called with RCU read lock. */
static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
struct genl_info *info,
bool always)
{
struct sk_buff *skb;
if (!always && !ovs_must_notify(&dp_flow_genl_family, info,
GROUP_ID(&ovs_dp_flow_multicast_group)))
return NULL;
skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL);
if (!skb)
return ERR_PTR(-ENOMEM);
return skb;
}
/* Called with ovs_mutex. */
static struct sk_buff *ovs_flow_cmd_build_info(struct datapath *dp,
const struct sw_flow *flow,
int dp_ifindex,
struct genl_info *info, u8 cmd,
bool always)
{
struct sk_buff *skb;
int retval;
skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info,
always);
if (IS_ERR_OR_NULL(skb))
return skb;
retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
info->snd_portid, info->snd_seq, 0,
cmd);
BUG_ON(retval < 0);
return skb;
}
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow *flow, *new_flow;
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
struct sw_flow_actions *acts;
struct sw_flow_match match;
int error;
/* Must have key and actions. */
error = -EINVAL;
if (!a[OVS_FLOW_ATTR_KEY]) {
OVS_NLERR("Flow key attribute not present in new flow.\n");
goto error;
}
if (!a[OVS_FLOW_ATTR_ACTIONS]) {
OVS_NLERR("Flow actions attribute not present in new flow.\n");
goto error;
}
/* Most of the time we need to allocate a new flow, do it before
* locking. */
new_flow = ovs_flow_alloc();
if (IS_ERR(new_flow)) {
error = PTR_ERR(new_flow);
goto error;
}
/* Extract key. */
ovs_match_init(&match, &new_flow->unmasked_key, &mask);
error = ovs_nla_get_match(&match,
a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
if (error)
goto err_kfree_flow;
ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask);
/* Validate actions. */
error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
&acts);
if (error) {
OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
goto err_kfree_flow;
}
reply = ovs_flow_cmd_alloc_info(acts, info, false);
if (IS_ERR(reply)) {
error = PTR_ERR(reply);
goto err_kfree_acts;
}
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
if (unlikely(!dp)) {
error = -ENODEV;
goto err_unlock_ovs;
}
/* Check if this is a duplicate flow */
flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key);
if (likely(!flow)) {
rcu_assign_pointer(new_flow->sf_acts, acts);
/* Put flow in bucket. */
error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
if (unlikely(error)) {
acts = NULL;
goto err_unlock_ovs;
}
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(new_flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW);
BUG_ON(error < 0);
}
ovs_unlock();
} else {
struct sw_flow_actions *old_acts;
/* Bail out if we're not allowed to modify an existing flow.
* We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
* because Generic Netlink treats the latter as a dump
* request. We also accept NLM_F_EXCL in case that bug ever
* gets fixed.
*/
if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
| NLM_F_EXCL))) {
error = -EEXIST;
goto err_unlock_ovs;
}
/* The unmasked key has to be the same for flow updates. */
if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
/* Look for any overlapping flow. */
flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
if (!flow) {
error = -ENOENT;
goto err_unlock_ovs;
}
}
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW);
BUG_ON(error < 0);
}
ovs_unlock();
ovs_nla_free_flow_actions(old_acts);
ovs_flow_free(new_flow, false);
}
if (reply)
ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
return 0;
err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
kfree(acts);
err_kfree_flow:
ovs_flow_free(new_flow, false);
error:
return error;
}
/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
const struct sw_flow_key *key,
const struct sw_flow_mask *mask)
{
struct sw_flow_actions *acts;
struct sw_flow_key masked_key;
int error;
ovs_flow_mask_key(&masked_key, key, mask);
error = ovs_nla_copy_actions(a, &masked_key, &acts);
if (error) {
OVS_NLERR("Actions may not be safe on all matching packets.\n");
return ERR_PTR(error);
}
return acts;
}
static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
struct sw_flow_mask mask;
struct sk_buff *reply = NULL;
struct datapath *dp;
struct sw_flow_actions *old_acts = NULL, *acts = NULL;
struct sw_flow_match match;
int error;
/* Extract key. */
error = -EINVAL;
if (!a[OVS_FLOW_ATTR_KEY]) {
OVS_NLERR("Flow key attribute not present in set flow.\n");
goto error;
}
ovs_match_init(&match, &key, &mask);
error = ovs_nla_get_match(&match,
a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
if (error)
goto error;
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask);
if (IS_ERR(acts)) {
error = PTR_ERR(acts);
goto error;
}
/* Can allocate before locking if have acts. */
reply = ovs_flow_cmd_alloc_info(acts, info, false);
if (IS_ERR(reply)) {
error = PTR_ERR(reply);
goto err_kfree_acts;
}
}
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
if (unlikely(!dp)) {
error = -ENODEV;
goto err_unlock_ovs;
}
/* Check that the flow exists. */
flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
if (unlikely(!flow)) {
error = -ENOENT;
goto err_unlock_ovs;
}
/* Update actions, if present. */
if (likely(acts)) {
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW);
BUG_ON(error < 0);
}
} else {
/* Could not alloc without acts before locking. */
reply = ovs_flow_cmd_build_info(dp, flow,
ovs_header->dp_ifindex,
info, OVS_FLOW_CMD_NEW, false);
if (unlikely(IS_ERR(reply))) {
error = PTR_ERR(reply);
goto err_unlock_ovs;
}
}
/* Clear stats. */
if (a[OVS_FLOW_ATTR_CLEAR])
ovs_flow_stats_clear(flow);
ovs_unlock();
if (reply)
ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
if (old_acts)
ovs_nla_free_flow_actions(old_acts);
return 0;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
kfree(acts);
error:
return error;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
}
static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sk_buff *reply;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
struct sw_flow *flow;
struct datapath *dp;
struct sw_flow_match match;
int err;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
if (!a[OVS_FLOW_ATTR_KEY]) {
OVS_NLERR("Flow get message rejected, Key attribute missing.\n");
return -EINVAL;
}
ovs_match_init(&match, &key, NULL);
err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
return err;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
if (!dp) {
err = -ENODEV;
goto unlock;
}
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
if (!flow) {
err = -ENOENT;
goto unlock;
}
reply = ovs_flow_cmd_build_info(dp, flow, ovs_header->dp_ifindex, info,
OVS_FLOW_CMD_NEW, true);
if (IS_ERR(reply)) {
err = PTR_ERR(reply);
goto unlock;
}
ovs_unlock();
return genlmsg_reply(reply, info);
unlock:
ovs_unlock();
return err;
}
static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sk_buff *reply;
struct sw_flow *flow;
struct datapath *dp;
struct sw_flow_match match;
int err;
if (likely(a[OVS_FLOW_ATTR_KEY])) {
ovs_match_init(&match, &key, NULL);
err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (unlikely(err))
return err;
}
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
if (unlikely(!dp)) {
err = -ENODEV;
goto unlock;
}
if (unlikely(!a[OVS_FLOW_ATTR_KEY])) {
err = ovs_flow_tbl_flush(&dp->table);
goto unlock;
}
flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
if (unlikely(!flow)) {
err = -ENOENT;
goto unlock;
}
ovs_flow_tbl_remove(&dp->table, flow);
ovs_unlock();
reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *)flow->sf_acts,
info, false);
if (likely(reply)) {
if (likely(!IS_ERR(reply))) {
rcu_read_lock(); /* Keep RCU checker happy. */
err = ovs_flow_cmd_fill_info(flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_DEL);
rcu_read_unlock();
BUG_ON(err < 0);
ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
} else {
genl_set_err(&dp_flow_genl_family, sock_net(skb->sk), 0,
GROUP_ID(&ovs_dp_flow_multicast_group), PTR_ERR(reply));
}
}
ovs_flow_free(flow, true);
return 0;
unlock:
ovs_unlock();
return err;
}
static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
struct table_instance *ti;
struct datapath *dp;
rcu_read_lock();
dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
if (!dp) {
rcu_read_unlock();
return -ENODEV;
}
ti = rcu_dereference(dp->table.ti);
for (;;) {
struct sw_flow *flow;
u32 bucket, obj;
bucket = cb->args[0];
obj = cb->args[1];
flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
if (!flow)
break;
if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
OVS_FLOW_CMD_NEW) < 0)
break;
cb->args[0] = bucket;
cb->args[1] = obj;
}
rcu_read_unlock();
return skb->len;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
}
static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
};
static struct genl_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_new
},
{ .cmd = OVS_FLOW_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_del
},
{ .cmd = OVS_FLOW_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = flow_policy,
.doit = ovs_flow_cmd_get,
.dumpit = ovs_flow_cmd_dump
},
{ .cmd = OVS_FLOW_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_set,
},
};
static struct genl_family dp_flow_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_FLOW_FAMILY,
.version = OVS_FLOW_VERSION,
.maxattr = OVS_FLOW_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_flow_genl_ops,
.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
};
static size_t ovs_dp_cmd_msg_size(void)
{
size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
msgsize += nla_total_size(IFNAMSIZ);
msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
return msgsize;
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
u32 portid, u32 seq, u32 flags, u8 cmd)
{
struct ovs_header *ovs_header;
struct ovs_dp_stats dp_stats;
struct ovs_dp_megaflow_stats dp_megaflow_stats;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
flags, cmd);
if (!ovs_header)
goto error;
ovs_header->dp_ifindex = get_dpifindex(dp);
err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
if (err)
goto nla_put_failure;
get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
&dp_stats))
goto nla_put_failure;
if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
sizeof(struct ovs_dp_megaflow_stats),
&dp_megaflow_stats))
goto nla_put_failure;
if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
goto nla_put_failure;
return genlmsg_end(skb, ovs_header);
nla_put_failure:
genlmsg_cancel(skb, ovs_header);
error:
return -EMSGSIZE;
}
static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
{
return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL);
}
/* Called with rcu_read_lock or ovs_mutex. */
static struct datapath *lookup_datapath(struct net *net,
struct ovs_header *ovs_header,
struct nlattr *a[OVS_DP_ATTR_MAX + 1])
{
struct datapath *dp;
if (!a[OVS_DP_ATTR_NAME])
dp = get_dp(net, ovs_header->dp_ifindex);
else {
struct vport *vport;
vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
}
return dp ? dp : ERR_PTR(-ENODEV);
}
static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
{
struct datapath *dp;
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
if (IS_ERR(dp))
return;
WARN(dp->user_features, "Dropping previously announced user features\n");
dp->user_features = 0;
}
static void ovs_dp_change(struct datapath *dp, struct nlattr **a)
{
if (a[OVS_DP_ATTR_USER_FEATURES])
dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
}
static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct vport_parms parms;
struct sk_buff *reply;
struct datapath *dp;
struct vport *vport;
struct ovs_net *ovs_net;
int err, i;
err = -EINVAL;
if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
goto err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
err = -ENOMEM;
dp = kzalloc(sizeof(*dp), GFP_KERNEL);
if (dp == NULL)
goto err_free_reply;
ovs_dp_set_net(dp, hold_net(sock_net(skb->sk)));
/* Allocate table. */
err = ovs_flow_tbl_init(&dp->table);
if (err)
goto err_free_dp;
dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
if (!dp->stats_percpu) {
err = -ENOMEM;
goto err_destroy_table;
}
dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
GFP_KERNEL);
if (!dp->ports) {
err = -ENOMEM;
goto err_destroy_percpu;
}
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(&dp->ports[i]);
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
parms.options = NULL;
parms.dp = dp;
parms.port_no = OVSP_LOCAL;
parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
ovs_dp_change(dp, a);
/* So far only local changes have been made, now need the lock. */
ovs_lock();
vport = new_vport(&parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
if (err == -EBUSY)
err = -EEXIST;
if (err == -EEXIST) {
/* An outdated user space instance that does not understand
* the concept of user_features has attempted to create a new
* datapath and is likely to reuse it. Drop all user features.
*/
if (info->genlhdr->version < OVS_DP_VER_FEATURES)
ovs_dp_reset_user_features(skb, info);
}
goto err_destroy_ports_array;
}
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_NEW);
BUG_ON(err < 0);
ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
ovs_unlock();
ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
return 0;
err_destroy_ports_array:
ovs_unlock();
kfree(dp->ports);
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
ovs_flow_tbl_destroy(&dp->table);
err_free_dp:
release_net(ovs_dp_get_net(dp));
kfree(dp);
err_free_reply:
kfree_skb(reply);
err:
return err;
}
/* Called with ovs_mutex. */
static void __dp_destroy(struct datapath *dp)
{
int i;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
struct hlist_node *n;
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
if (vport->port_no != OVSP_LOCAL)
ovs_dp_detach_port(vport);
}
list_del_rcu(&dp->list_node);
/* OVSP_LOCAL is datapath internal port. We need to make sure that
* all ports in datapath are destroyed first before freeing datapath.
*/
ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
/* RCU destroy the flow table */
call_rcu(&dp->rcu, destroy_dp_rcu);
}
static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *reply;
struct datapath *dp;
int err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
ovs_lock();
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
err = PTR_ERR(dp);
if (IS_ERR(dp))
goto err_unlock_free;
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_DEL);
BUG_ON(err < 0);
__dp_destroy(dp);
ovs_unlock();
ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
return 0;
err_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *reply;
struct datapath *dp;
int err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
ovs_lock();
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
err = PTR_ERR(dp);
if (IS_ERR(dp))
goto err_unlock_free;
ovs_dp_change(dp, info->attrs);
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_NEW);
BUG_ON(err < 0);
ovs_unlock();
ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
return 0;
err_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *reply;
struct datapath *dp;
int err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
rcu_read_lock();
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
if (IS_ERR(dp)) {
err = PTR_ERR(dp);
goto err_unlock_free;
}
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_NEW);
BUG_ON(err < 0);
rcu_read_unlock();
return genlmsg_reply(reply, info);
err_unlock_free:
rcu_read_unlock();
kfree_skb(reply);
return err;
}
static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
struct datapath *dp;
int skip = cb->args[0];
int i = 0;
rcu_read_lock();
list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) {
if (i >= skip &&
ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
OVS_DP_CMD_NEW) < 0)
break;
i++;
}
rcu_read_unlock();
cb->args[0] = i;
return skb->len;
}
static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
[OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
};
static struct genl_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_new
},
{ .cmd = OVS_DP_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_del
},
{ .cmd = OVS_DP_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_get,
.dumpit = ovs_dp_cmd_dump
},
{ .cmd = OVS_DP_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_set,
},
};
static struct genl_family dp_datapath_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
.maxattr = OVS_DP_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_datapath_genl_ops,
.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
};
/* Called with ovs_mutex or RCU read lock. */
static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
u32 portid, u32 seq, u32 flags, u8 cmd)
{
struct ovs_header *ovs_header;
struct ovs_vport_stats vport_stats;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
flags, cmd);
if (!ovs_header)
return -EMSGSIZE;
ovs_header->dp_ifindex = get_dpifindex(vport->dp);
if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)))
goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
&vport_stats))
goto nla_put_failure;
if (ovs_vport_get_upcall_portids(vport, skb))
goto nla_put_failure;
err = ovs_vport_get_options(vport, skb);
if (err == -EMSGSIZE)
goto error;
return genlmsg_end(skb, ovs_header);
nla_put_failure:
err = -EMSGSIZE;
error:
genlmsg_cancel(skb, ovs_header);
return err;
}
static struct sk_buff *ovs_vport_cmd_alloc_info(void)
{
return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
}
/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
u32 seq, u8 cmd)
{
struct sk_buff *skb;
int retval;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb)
return ERR_PTR(-ENOMEM);
retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
BUG_ON(retval < 0);
return skb;
}
/* Called with ovs_mutex or RCU read lock. */
static struct vport *lookup_vport(struct net *net,
struct ovs_header *ovs_header,
struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
{
struct datapath *dp;
struct vport *vport;
if (a[OVS_VPORT_ATTR_NAME]) {
vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
if (!vport)
return ERR_PTR(-ENODEV);
if (ovs_header->dp_ifindex &&
ovs_header->dp_ifindex != get_dpifindex(vport->dp))
return ERR_PTR(-ENODEV);
return vport;
} else if (a[OVS_VPORT_ATTR_PORT_NO]) {
u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
if (port_no >= DP_MAX_PORTS)
return ERR_PTR(-EFBIG);
dp = get_dp(net, ovs_header->dp_ifindex);
if (!dp)
return ERR_PTR(-ENODEV);
vport = ovs_vport_ovsl_rcu(dp, port_no);
if (!vport)
return ERR_PTR(-ENODEV);
return vport;
} else
return ERR_PTR(-EINVAL);
}
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct vport_parms parms;
struct sk_buff *reply;
struct vport *vport;
struct datapath *dp;
2011-01-10 13:12:12 -08:00
u32 port_no;
int err;
2011-01-10 13:12:12 -08:00
if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
!a[OVS_VPORT_ATTR_UPCALL_PID])
return -EINVAL;
port_no = a[OVS_VPORT_ATTR_PORT_NO]
? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
if (port_no >= DP_MAX_PORTS)
return -EFBIG;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
err = -ENODEV;
if (!dp)
goto exit_unlock_free;
if (port_no) {
vport = ovs_vport_ovsl(dp, port_no);
err = -EBUSY;
if (vport)
goto exit_unlock_free;
} else {
for (port_no = 1; ; port_no++) {
if (port_no >= DP_MAX_PORTS) {
err = -EFBIG;
goto exit_unlock_free;
}
vport = ovs_vport_ovsl(dp, port_no);
if (!vport)
break;
}
}
2011-01-10 13:12:12 -08:00
parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
parms.options = a[OVS_VPORT_ATTR_OPTIONS];
parms.dp = dp;
parms.port_no = port_no;
parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];
vport = new_vport(&parms);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock_free;
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
BUG_ON(err < 0);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
return 0;
exit_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct sk_buff *reply;
struct vport *vport;
int err;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
ovs_lock();
vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock_free;
if (a[OVS_VPORT_ATTR_TYPE] &&
nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
err = -EINVAL;
goto exit_unlock_free;
}
if (a[OVS_VPORT_ATTR_OPTIONS]) {
err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
if (err)
goto exit_unlock_free;
}
if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
err = ovs_vport_set_upcall_portids(vport,
a[OVS_VPORT_ATTR_UPCALL_PID]);
if (err)
goto exit_unlock_free;
}
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
BUG_ON(err < 0);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
return 0;
exit_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct sk_buff *reply;
struct vport *vport;
int err;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
ovs_lock();
vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock_free;
if (vport->port_no == OVSP_LOCAL) {
err = -EINVAL;
goto exit_unlock_free;
}
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_DEL);
BUG_ON(err < 0);
ovs_dp_detach_port(vport);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
return 0;
exit_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sk_buff *reply;
struct vport *vport;
int err;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
rcu_read_lock();
vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock_free;
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
BUG_ON(err < 0);
rcu_read_unlock();
return genlmsg_reply(reply, info);
exit_unlock_free:
rcu_read_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
struct datapath *dp;
int bucket = cb->args[0], skip = cb->args[1];
int i, j = 0;
rcu_read_lock();
dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
if (!dp) {
rcu_read_unlock();
return -ENODEV;
}
for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
j = 0;
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
if (j >= skip &&
ovs_vport_cmd_fill_info(vport, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
OVS_VPORT_CMD_NEW) < 0)
goto out;
j++;
}
skip = 0;
}
out:
rcu_read_unlock();
cb->args[0] = i;
cb->args[1] = j;
return skb->len;
}
static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
[OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
};
static struct genl_ops dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_new
},
{ .cmd = OVS_VPORT_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_del
},
{ .cmd = OVS_VPORT_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = vport_policy,
.doit = ovs_vport_cmd_get,
.dumpit = ovs_vport_cmd_dump
},
{ .cmd = OVS_VPORT_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_set,
},
};
struct genl_family dp_vport_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_VPORT_FAMILY,
.version = OVS_VPORT_VERSION,
.maxattr = OVS_VPORT_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_vport_genl_ops,
.n_ops = ARRAY_SIZE(dp_vport_genl_ops),
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
};
static struct genl_family *dp_genl_families[] = {
&dp_datapath_genl_family,
&dp_vport_genl_family,
&dp_flow_genl_family,
&dp_packet_genl_family,
};
static void dp_unregister_genl(int n_families)
{
int i;
for (i = 0; i < n_families; i++)
genl_unregister_family(dp_genl_families[i]);
}
static int dp_register_genl(void)
{
int err;
int i;
for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
err = genl_register_family(dp_genl_families[i]);
if (err)
goto error;
}
return 0;
error:
dp_unregister_genl(i);
return err;
}
static int __net_init ovs_init_net(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
return 0;
}
static void __net_exit ovs_exit_net(struct net *net)
{
struct datapath *dp, *dp_next;
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
ovs_lock();
list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
__dp_destroy(dp);
ovs_unlock();
cancel_work_sync(&ovs_net->dp_notify_work);
}
static struct pernet_operations ovs_net_ops = {
.init = ovs_init_net,
.exit = ovs_exit_net,
.id = &ovs_net_id,
.size = sizeof(struct ovs_net),
};
DEFINE_COMPAT_PNET_REG_FUNC(device);
static int __init dp_init(void)
{
int err;
BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath %s, built "__DATE__" "__TIME__"\n",
VERSION);
err = action_fifos_init();
if (err)
goto error;
err = ovs_flow_init();
if (err)
goto error_action_fifos_exit;
err = ovs_vport_init();
if (err)
goto error_flow_exit;
err = register_pernet_device(&ovs_net_ops);
if (err)
goto error_vport_exit;
err = register_netdevice_notifier(&ovs_dp_device_notifier);
if (err)
goto error_netns_exit;
err = dp_register_genl();
if (err < 0)
goto error_unreg_notifier;
return 0;
error_unreg_notifier:
unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_netns_exit:
unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
ovs_vport_exit();
error_flow_exit:
ovs_flow_exit();
error_action_fifos_exit:
action_fifos_exit();
error:
return err;
}
static void dp_cleanup(void)
{
dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
unregister_netdevice_notifier(&ovs_dp_device_notifier);
unregister_pernet_device(&ovs_net_ops);
rcu_barrier();
ovs_vport_exit();
ovs_flow_exit();
action_fifos_exit();
}
module_init(dp_init);
module_exit(dp_cleanup);
MODULE_DESCRIPTION("Open vSwitch switching datapath");
MODULE_LICENSE("GPL");
MODULE_VERSION(VERSION);