2
0
mirror of https://github.com/openvswitch/ovs synced 2025-09-07 01:35:46 +00:00
Files
ovs/datapath/datapath.c

2402 lines
58 KiB
C
Raw Normal View History

/*
* Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/version.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "gso.h"
#include "vlan.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
int ovs_net_id __read_mostly;
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
EXPORT_SYMBOL_GPL(ovs_net_id);
static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;
static const struct nla_policy flow_policy[];
static struct genl_multicast_group ovs_dp_flow_multicast_group = {
.name = OVS_FLOW_MCGROUP
};
static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
.name = OVS_DATAPATH_MCGROUP
};
struct genl_multicast_group ovs_dp_vport_multicast_group = {
.name = OVS_VPORT_MCGROUP
};
/* Check if need to build a reply message.
* OVS userspace sets the NLM_F_ECHO flag if it needs the reply.
*/
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
unsigned int group)
{
return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
genl_has_listeners(family, genl_info_net(info), group);
}
static void ovs_notify(struct genl_family *family, struct genl_multicast_group *grp,
struct sk_buff *skb, struct genl_info *info)
{
genl_notify(family, skb, genl_info_net(info),
info->snd_portid, GROUP_ID(grp), info->nlhdr, GFP_KERNEL);
}
/**
* DOC: Locking:
*
* All writes e.g. Writes to device state (add/remove datapath, port, set
* operations on vports, etc.), Writes to other state (flow table
* modifications, set miscellaneous datapath parameters, etc.) are protected
* by ovs_lock.
*
* Reads are protected by RCU.
*
* There are a few special cases (mostly stats) that have their own
* synchronization but they nest under all of above and don't interact with
* each other.
*
* The RTNL lock nests inside ovs_mutex.
*/
static DEFINE_MUTEX(ovs_mutex);
void ovs_lock(void)
{
mutex_lock(&ovs_mutex);
}
void ovs_unlock(void)
{
mutex_unlock(&ovs_mutex);
}
#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
if (debug_locks)
return lockdep_is_held(&ovs_mutex);
else
return 1;
}
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
const struct sw_flow_key *,
const struct dp_upcall_info *);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
const struct sw_flow_key *,
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
const struct dp_upcall_info *);
/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
if (dev) {
struct vport *vport = ovs_internal_dev_get_vport(dev);
if (vport)
return vport->dp;
}
return NULL;
}
/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
* returned dp pointer valid.
*/
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
struct datapath *dp;
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
rcu_read_lock();
dp = get_dp_rcu(net, dp_ifindex);
rcu_read_unlock();
return dp;
}
/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
return ovs_vport_name(vport);
}
static int get_dpifindex(const struct datapath *dp)
{
struct vport *local;
int ifindex;
rcu_read_lock();
local = ovs_vport_rcu(dp, OVSP_LOCAL);
if (local)
ifindex = local->dev->ifindex;
else
ifindex = 0;
rcu_read_unlock();
return ifindex;
}
static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
ovs_flow_tbl_destroy(&dp->table);
free_percpu(dp->stats_percpu);
kfree(dp->ports);
kfree(dp);
}
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
u16 port_no)
{
return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}
/* Called with ovs_mutex or RCU read lock. */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
struct vport *vport;
struct hlist_head *head;
head = vport_hash_bucket(dp, port_no);
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
if (vport->port_no == port_no)
return vport;
}
return NULL;
}
/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
struct vport *vport;
vport = ovs_vport_add(parms);
if (!IS_ERR(vport)) {
struct datapath *dp = parms->dp;
struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
hlist_add_head_rcu(&vport->dp_hash_node, head);
}
return vport;
}
void ovs_dp_detach_port(struct vport *p)
{
ASSERT_OVSL();
/* First drop references to device. */
hlist_del_rcu(&p->dp_hash_node);
/* Then destroy it. */
ovs_vport_del(p);
}
/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
const struct vport *p = OVS_CB(skb)->input_vport;
struct datapath *dp = p->dp;
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct dp_stats_percpu *stats;
u64 *stats_counter;
u32 n_mask_hit;
stats = this_cpu_ptr(dp->stats_percpu);
/* Look up flow. */
flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
&n_mask_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
int error;
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
upcall.mru = OVS_CB(skb)->mru;
error = ovs_dp_upcall(dp, skb, key, &upcall);
if (unlikely(error))
kfree_skb(skb);
else
consume_skb(skb);
stats_counter = &stats->n_missed;
goto out;
}
ovs_flow_stats_update(flow, key->tp.flags, skb);
sf_acts = rcu_dereference(flow->sf_acts);
ovs_execute_actions(dp, skb, sf_acts, key);
stats_counter = &stats->n_hit;
datapath: Detect and suppress flows that are implicated in loops. In-kernel loops need to be suppressed; otherwise, they cause high CPU consumption, even to the point that the machine becomes unusable. Ideally these flows should never be added to the Open vSwitch flow table, but it is fairly easy for a buggy controller to create them given the menagerie of tunnels, patches, etc. that OVS makes available. Commit ecbb6953b "datapath: Add loop checking" did the initial work toward suppressing loops, by dropping packets that recursed more than 5 times. This at least prevented the kernel stack from overflowing and thereby OOPSing the machine. But even with this commit, it is still possible to waste a lot of CPU time due to loops. The problem is not limited to 5 recursive calls per packet: any packet can be sent to multiple destinations, which in turn can themselves be sent to multiple destinations, and so on. We have actually seen in practice a case where each packet was, apparently, sent to at least 2 destinations per hop, so that each packet actually consumed CPU time for 2**5 == 32 packets, possibly more. This commit takes loop suppression a step further, by clearing the actions of flows that are implicated in loops. Thus, after the first packet in such a flow, later packets for either the "root" flow or for flows that it ends up looping through are simply discarded, saving a huge amount of CPU time. This version of the commit just clears the actions from the flows that a part of the loop. Probably, there should be some additional action to tell ovs-vswitchd that a loop has been detected, so that it can in turn inform the controller one way or another. My test case was this: ovs-controller -H --max-idle=permanent punix:/tmp/controller ovs-vsctl -- \ set-controller br0 unix:/tmp/controller -- \ add-port br0 patch00 -- \ add-port br0 patch01 -- \ add-port br0 patch10 -- \ add-port br0 patch11 -- \ add-port br0 patch20 -- \ add-port br0 patch21 -- \ add-port br0 patch30 -- \ add-port br0 patch31 -- \ set Interface patch00 type=patch options:peer=patch01 -- \ set Interface patch01 type=patch options:peer=patch00 -- \ set Interface patch10 type=patch options:peer=patch11 -- \ set Interface patch11 type=patch options:peer=patch10 -- \ set Interface patch20 type=patch options:peer=patch21 -- \ set Interface patch21 type=patch options:peer=patch20 -- \ set Interface patch30 type=patch options:peer=patch31 -- \ set Interface patch31 type=patch options:peer=patch30 followed by sending a single "ping" packet from an attached Ethernet port into the bridge. After this, without this commit the vswitch userspace and kernel consume 50-75% of the machine's CPU (in my KVM test setup on a single physical host); with this commit, some CPU is consumed initially but it converges on 0% quickly. A more challenging test sends a series of packets in multiple flows; I used "hping3" with its default options. Without this commit, the vswitch consumes 100% of the machine's CPU, most of which is in the kernel. With this commit, the vswitch consumes "only" 33-50% CPU, most of which is in userspace, so the machine is more responsive. A refinement on this commit would be to pass the loop counter down to userspace as part of the odp_msg struct and then back up as part of the ODP_EXECUTE command arguments. This would, presumably, reduce the CPU requirements, since it would allow loop detection to happen earlier, during initial setup of flows, instead of just on the second and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
out:
datapath: Detect and suppress flows that are implicated in loops. In-kernel loops need to be suppressed; otherwise, they cause high CPU consumption, even to the point that the machine becomes unusable. Ideally these flows should never be added to the Open vSwitch flow table, but it is fairly easy for a buggy controller to create them given the menagerie of tunnels, patches, etc. that OVS makes available. Commit ecbb6953b "datapath: Add loop checking" did the initial work toward suppressing loops, by dropping packets that recursed more than 5 times. This at least prevented the kernel stack from overflowing and thereby OOPSing the machine. But even with this commit, it is still possible to waste a lot of CPU time due to loops. The problem is not limited to 5 recursive calls per packet: any packet can be sent to multiple destinations, which in turn can themselves be sent to multiple destinations, and so on. We have actually seen in practice a case where each packet was, apparently, sent to at least 2 destinations per hop, so that each packet actually consumed CPU time for 2**5 == 32 packets, possibly more. This commit takes loop suppression a step further, by clearing the actions of flows that are implicated in loops. Thus, after the first packet in such a flow, later packets for either the "root" flow or for flows that it ends up looping through are simply discarded, saving a huge amount of CPU time. This version of the commit just clears the actions from the flows that a part of the loop. Probably, there should be some additional action to tell ovs-vswitchd that a loop has been detected, so that it can in turn inform the controller one way or another. My test case was this: ovs-controller -H --max-idle=permanent punix:/tmp/controller ovs-vsctl -- \ set-controller br0 unix:/tmp/controller -- \ add-port br0 patch00 -- \ add-port br0 patch01 -- \ add-port br0 patch10 -- \ add-port br0 patch11 -- \ add-port br0 patch20 -- \ add-port br0 patch21 -- \ add-port br0 patch30 -- \ add-port br0 patch31 -- \ set Interface patch00 type=patch options:peer=patch01 -- \ set Interface patch01 type=patch options:peer=patch00 -- \ set Interface patch10 type=patch options:peer=patch11 -- \ set Interface patch11 type=patch options:peer=patch10 -- \ set Interface patch20 type=patch options:peer=patch21 -- \ set Interface patch21 type=patch options:peer=patch20 -- \ set Interface patch30 type=patch options:peer=patch31 -- \ set Interface patch31 type=patch options:peer=patch30 followed by sending a single "ping" packet from an attached Ethernet port into the bridge. After this, without this commit the vswitch userspace and kernel consume 50-75% of the machine's CPU (in my KVM test setup on a single physical host); with this commit, some CPU is consumed initially but it converges on 0% quickly. A more challenging test sends a series of packets in multiple flows; I used "hping3" with its default options. Without this commit, the vswitch consumes 100% of the machine's CPU, most of which is in the kernel. With this commit, the vswitch consumes "only" 33-50% CPU, most of which is in userspace, so the machine is more responsive. A refinement on this commit would be to pass the loop counter down to userspace as part of the odp_msg struct and then back up as part of the ODP_EXECUTE command arguments. This would, presumably, reduce the CPU requirements, since it would allow loop detection to happen earlier, during initial setup of flows, instead of just on the second and subsequent packets of flows.
2010-08-03 14:40:29 -07:00
/* Update datapath statistics. */
u64_stats_update_begin(&stats->syncp);
(*stats_counter)++;
stats->n_mask_hit += n_mask_hit;
u64_stats_update_end(&stats->syncp);
}
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info)
{
struct dp_stats_percpu *stats;
int err;
if (upcall_info->portid == 0) {
err = -ENOTCONN;
goto err;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
if (!skb_is_gso(skb))
err = queue_userspace_packet(dp, skb, key, upcall_info);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
else
err = queue_gso_packets(dp, skb, key, upcall_info);
if (err)
goto err;
return 0;
err:
stats = this_cpu_ptr(dp->stats_percpu);
u64_stats_update_begin(&stats->syncp);
stats->n_lost++;
u64_stats_update_end(&stats->syncp);
return err;
}
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
const struct dp_upcall_info *upcall_info)
{
unsigned short gso_type = skb_shinfo(skb)->gso_type;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
struct ovs_skb_cb ovs_cb;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
int err;
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
ovs_cb = *OVS_CB(skb);
segs = __skb_gso_segment(skb, NETIF_F_SG, false);
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
*OVS_CB(skb) = ovs_cb;
if (IS_ERR(segs))
return PTR_ERR(segs);
if (segs == NULL)
return -EINVAL;
if (gso_type & SKB_GSO_UDP) {
/* The initial flow key extracted by ovs_flow_key_extract()
* in this case is for a first fragment, so we need to
* properly mark later fragments.
*/
later_key = *key;
later_key.ip.frag = OVS_FRAG_TYPE_LATER;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Queue all of the segments. */
skb = segs;
do {
datapath: Restore OVS_CB after skb_segment. OVS needs to segments large skb before sending it for miss packet handling to userspace. but skb_gso_segment uses skb->cb. This corrupted OVS_CB which result in following panic. [ 735.419921] BUG: unable to handle kernel paging request at 00000014000001b2 [ 735.423168] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 735.445097] RIP: 0010:[<ffffffffa05df0d7>] [<ffffffffa05df0d7>] ovs_nla_put_flow+0x37/0x7c0 [openvswitch] [ 735.468858] Call Trace: [ 735.470384] [<ffffffffa05d7ec2>] queue_userspace_packet+0x332/0x4d0 [openvswitch] [ 735.471741] [<ffffffffa05d8155>] queue_gso_packets+0xf5/0x180 [openvswitch] [ 735.481862] [<ffffffffa05da9f5>] ovs_dp_upcall+0x65/0x70 [openvswitch] [ 735.483031] [<ffffffffa05dab81>] ovs_dp_process_packet+0x181/0x1b0 [openvswitch] [ 735.484391] [<ffffffffa05e2f55>] ovs_vport_receive+0x65/0x90 [openvswitch] [ 735.492638] [<ffffffffa05e5738>] internal_dev_xmit+0x68/0x110 [openvswitch] [ 735.495334] [<ffffffff81588eb6>] dev_hard_start_xmit+0x2e6/0x8b0 [ 735.496503] [<ffffffff81589847>] __dev_queue_xmit+0x3c7/0x920 [ 735.499827] [<ffffffff81589db0>] dev_queue_xmit+0x10/0x20 [ 735.500798] [<ffffffff815d3b60>] ip_finish_output+0x6a0/0x950 [ 735.502818] [<ffffffff815d55f8>] ip_output+0x68/0x110 [ 735.503835] [<ffffffff815d4979>] ip_local_out+0x29/0x90 [ 735.504801] [<ffffffff815d4e46>] ip_queue_xmit+0x1d6/0x640 [ 735.507015] [<ffffffff815ee0d7>] tcp_transmit_skb+0x477/0xac0 [ 735.508260] [<ffffffff815ee856>] tcp_write_xmit+0x136/0xba0 [ 735.510829] [<ffffffff815ef56e>] __tcp_push_pending_frames+0x2e/0xc0 [ 735.512296] [<ffffffff815e0593>] tcp_sendmsg+0xa63/0xd50 [ 735.513526] [<ffffffff81612c2c>] inet_sendmsg+0x10c/0x220 [ 735.516025] [<ffffffff81566b8c>] sock_sendmsg+0x9c/0xe0 [ 735.518066] [<ffffffff81566d41>] SYSC_sendto+0x121/0x1c0 [ 735.521398] [<ffffffff8156801e>] SyS_sendto+0xe/0x10 [ 735.522473] [<ffffffff816df5e9>] system_call_fastpath+0x16/0x1b Reported-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com>
2014-09-20 21:10:49 -07:00
*OVS_CB(skb) = ovs_cb;
if (gso_type & SKB_GSO_UDP && skb != segs)
key = &later_key;
err = queue_userspace_packet(dp, skb, key, upcall_info);
if (err)
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
break;
datapath: Report kernel's flow key when passing packets up to userspace. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. This commit takes one step in that direction by making the kernel report its idea of the flow that a packet belongs to whenever it passes a packet up to userspace. This means that userspace can intelligently figure out what to do: - If userspace's notion of the flow for the packet matches the kernel's, then nothing special is necessary. - If the kernel has a more specific notion for the flow than userspace, for example if the kernel decoded IPv6 headers but userspace stopped at the Ethernet type (because it does not understand IPv6), then again nothing special is necessary: userspace can still set up the flow in the usual way. - If userspace has a more specific notion for the flow than the kernel, for example if userspace decoded an IPv6 header but the kernel stopped at the Ethernet type, then userspace can forward the packet manually, without setting up a flow in the kernel. (This case is bad from a performance point of view, but at least it is correct.) This commit does not actually make userspace flexible enough to handle changes in the kernel flow key structure, although userspace does now have enough information to do that intelligently. This will have to wait for later commits. This commit is bigger than it would otherwise be because it is rolled together with changing "struct odp_msg" to a sequence of Netlink attributes. The alternative, to do each of those changes in a separate patch, seemed like overkill because it meant that either we would have to introduce and then kill off Netlink attributes for in_port and tun_id, if Netlink conversion went first, or shove yet another variable-length header into the stuff already after odp_msg, if adding the flow key to odp_msg went first. This commit will slow down performance of checksumming packets sent up to userspace. I'm not entirely pleased with how I did it. I considered a couple of alternatives, but none of them seemed that much better. Suggestions welcome. Not changing anything wasn't an option, unfortunately. At any rate some slowdown will become unavoidable when OVS actually starts using Netlink instead of just Netlink framing. (Actually, I thought of one option where we could avoid that: make userspace do the checksum instead, by passing csum_start and csum_offset as part of what goes to userspace. But that's not perfect either.) Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2011-01-24 14:59:57 -08:00
} while ((skb = skb->next));
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Free all of the segments. */
skb = segs;
do {
nskb = skb->next;
if (err)
kfree_skb(skb);
else
consume_skb(skb);
} while ((skb = nskb));
return err;
}
static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
unsigned int hdrlen)
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */
/* OVS_PACKET_ATTR_USERDATA */
if (upcall_info->userdata)
size += NLA_ALIGN(upcall_info->userdata->nla_len);
/* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
if (upcall_info->egress_tun_info)
size += nla_total_size(ovs_tun_key_attr_size());
/* OVS_PACKET_ATTR_ACTIONS */
if (upcall_info->actions_len)
size += nla_total_size(upcall_info->actions_len);
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
/* OVS_PACKET_ATTR_MRU */
if (upcall_info->mru)
size += nla_total_size(sizeof(upcall_info->mru));
return size;
}
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
static void pad_packet(struct datapath *dp, struct sk_buff *skb)
{
if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
size_t plen = NLA_ALIGN(skb->len) - skb->len;
if (plen > 0)
memset(skb_put(skb, plen), 0, plen);
}
}
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb = NULL; /* to be queued to userspace */
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
struct nlattr *nla;
struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
.snd_portid = upcall_info->portid,
};
size_t len;
unsigned int hlen;
int err, dp_ifindex;
dp_ifindex = get_dpifindex(dp);
if (!dp_ifindex)
return -ENODEV;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
if (skb_vlan_tag_present(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, skb_vlan_tag_get(nskb));
if (!nskb)
return -ENOMEM;
vlan_set_tci(nskb, 0);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
skb = nskb;
}
if (nla_attr_size(skb->len) > USHRT_MAX) {
err = -EFBIG;
goto out;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Complete checksum if needed */
if (skb->ip_summed == CHECKSUM_PARTIAL &&
(err = skb_checksum_help(skb)))
goto out;
/* Older versions of OVS user space enforce alignment of the last
* Netlink attribute to NLA_ALIGNTO which would require extensive
* padding logic. Only perform zerocopy if padding is not required.
*/
if (dp->user_features & OVS_DP_F_UNALIGNED)
hlen = skb_zerocopy_headlen(skb);
else
hlen = skb->len;
len = upcall_msg_size(upcall_info, hlen);
user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
goto out;
}
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
0, upcall_info->cmd);
upcall->dp_ifindex = dp_ifindex;
err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
BUG_ON(err);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
if (upcall_info->userdata)
__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
if (upcall_info->egress_tun_info) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
err = ovs_nla_put_egress_tunnel_key(user_skb,
upcall_info->egress_tun_info,
upcall_info->egress_tun_opts);
BUG_ON(err);
nla_nest_end(user_skb, nla);
}
if (upcall_info->actions_len) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
err = ovs_nla_put_actions(upcall_info->actions,
upcall_info->actions_len,
user_skb);
if (!err)
nla_nest_end(user_skb, nla);
else
nla_nest_cancel(user_skb, nla);
}
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
/* Add OVS_PACKET_ATTR_MRU */
if (upcall_info->mru) {
if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
upcall_info->mru)) {
err = -ENOBUFS;
goto out;
}
pad_packet(dp, user_skb);
}
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy()
*/
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
err = -ENOBUFS;
goto out;
}
nla->nla_len = nla_attr_size(skb->len);
err = skb_zerocopy(user_skb, skb, skb->len, hlen);
if (err)
goto out;
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
pad_packet(dp, user_skb);
((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
user_skb = NULL;
out:
if (err)
skb_tx_error(skb);
kfree_skb(user_skb);
kfree_skb(nskb);
return err;
}
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
struct ovs_header *ovs_header = info->userhdr;
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
struct net *net = sock_net(skb->sk);
struct nlattr **a = info->attrs;
struct sw_flow_actions *acts;
struct sk_buff *packet;
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct datapath *dp;
struct ethhdr *eth;
struct vport *input_vport;
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
u16 mru = 0;
int len;
int err;
bool log = !a[OVS_PACKET_ATTR_PROBE];
err = -EINVAL;
if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
!a[OVS_PACKET_ATTR_ACTIONS])
goto err;
len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
err = -ENOMEM;
if (!packet)
goto err;
skb_reserve(packet, NET_IP_ALIGN);
nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
skb_reset_mac_header(packet);
eth = eth_hdr(packet);
/* Normally, setting the skb 'protocol' field would be handled by a
* call to eth_type_trans(), but it assumes there's a sending
* device, which we may not have.
*/
if (eth_proto_is_802_3(eth->h_proto))
packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
/* Set packet's mru */
if (a[OVS_PACKET_ATTR_MRU]) {
mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
packet->ignore_df = 1;
}
OVS_CB(packet)->mru = mru;
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
if (IS_ERR(flow))
goto err_kfree_skb;
err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet,
&flow->key, log);
if (err)
goto err_flow_free;
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
&flow->key, &acts, log);
if (err)
goto err_flow_free;
rcu_assign_pointer(flow->sf_acts, acts);
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
rcu_read_lock();
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
dp = get_dp_rcu(net, ovs_header->dp_ifindex);
err = -ENODEV;
if (!dp)
goto err_unlock;
input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
if (!input_vport)
input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);
if (!input_vport)
goto err_unlock;
packet->dev = input_vport->dev;
OVS_CB(packet)->input_vport = input_vport;
sf_acts = rcu_dereference(flow->sf_acts);
local_bh_disable();
err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
local_bh_enable();
rcu_read_unlock();
ovs_flow_free(flow, false);
return err;
err_unlock:
rcu_read_unlock();
err_flow_free:
ovs_flow_free(flow, false);
err_kfree_skb:
kfree_skb(packet);
err:
return err;
}
static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
[OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
};
static struct genl_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = packet_policy,
.doit = ovs_packet_cmd_execute
}
};
static struct genl_family dp_packet_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_PACKET_FAMILY,
.version = OVS_PACKET_VERSION,
.maxattr = OVS_PACKET_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_packet_genl_ops,
.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
};
static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
struct ovs_dp_megaflow_stats *mega_stats)
{
int i;
memset(mega_stats, 0, sizeof(*mega_stats));
stats->n_flows = ovs_flow_tbl_count(&dp->table);
mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
Implement new fragment handling policy. Until now, OVS has handled IP fragments more awkwardly than necessary. It has not been possible to match on L4 headers, even in fragments with offset 0 where they are actually present. This means that there was no way to implement ACLs that treat, say, different TCP ports differently, on fragmented traffic; instead, all decisions for fragment forwarding had to be made on the basis of L2 and L3 headers alone. This commit improves the situation significantly. It is still not possible to match on L4 headers in fragments with nonzero offset, because that information is simply not present in such fragments, but this commit adds the ability to match on L4 headers for fragments with zero offset. This means that it becomes possible to implement ACLs that drop such "first fragments" on the basis of L4 headers. In practice, that effectively blocks even fragmented traffic on an L4 basis, because the receiving IP stack cannot reassemble a full packet when the first fragment is missing. This commit works by adding a new "fragment type" to the kernel flow match and making it available through OpenFlow as a new NXM field named NXM_NX_IP_FRAG. Because OpenFlow 1.0 explicitly says that the L4 fields are always 0 for IP fragments, it adds a new OpenFlow fragment handling mode that fills in the L4 fields for "first fragments". It also enhances ovs-ofctl to allow users to configure this new fragment handling mode and to parse the new field. Signed-off-by: Ben Pfaff <blp@nicira.com> Bug #7557.
2011-10-19 21:33:44 -07:00
stats->n_hit = stats->n_missed = stats->n_lost = 0;
for_each_possible_cpu(i) {
const struct dp_stats_percpu *percpu_stats;
struct dp_stats_percpu local_stats;
unsigned int start;
percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
do {
start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
local_stats = *percpu_stats;
} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
stats->n_lost += local_stats.n_lost;
mega_stats->n_mask_hit += local_stats.n_mask_hit;
}
}
static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags)
{
return ovs_identifier_is_ufid(sfid) &&
!(ufid_flags & OVS_UFID_F_OMIT_KEY);
}
static bool should_fill_mask(uint32_t ufid_flags)
{
return !(ufid_flags & OVS_UFID_F_OMIT_MASK);
}
static bool should_fill_actions(uint32_t ufid_flags)
{
return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS);
}
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
const struct sw_flow_id *sfid,
uint32_t ufid_flags)
{
size_t len = NLMSG_ALIGN(sizeof(struct ovs_header));
/* OVS_FLOW_ATTR_UFID */
if (sfid && ovs_identifier_is_ufid(sfid))
len += nla_total_size(sfid->ufid_len);
/* OVS_FLOW_ATTR_KEY */
if (!sfid || should_fill_key(sfid, ufid_flags))
len += nla_total_size(ovs_key_attr_size());
/* OVS_FLOW_ATTR_MASK */
if (should_fill_mask(ufid_flags))
len += nla_total_size(ovs_key_attr_size());
/* OVS_FLOW_ATTR_ACTIONS */
if (should_fill_actions(ufid_flags))
len += nla_total_size(acts->orig_len);
return len
+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
+ nla_total_size(8); /* OVS_FLOW_ATTR_USED */
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
struct sk_buff *skb)
{
struct ovs_flow_stats stats;
__be16 tcp_flags;
unsigned long used;
ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
if (used &&
nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
return -EMSGSIZE;
if (stats.n_packets &&
nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
return -EMSGSIZE;
if ((u8)ntohs(tcp_flags) &&
nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
return -EMSGSIZE;
return 0;
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
struct sk_buff *skb, int skb_orig_len)
{
struct nlattr *start;
int err;
/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
* this is the first flow to be dumped into 'skb'. This is unusual for
* Netlink but individual action lists can be longer than
* NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
* The userspace caller can always fetch the actions separately if it
* really wants them. (Most userspace callers in fact don't care.)
*
* This can only fail for dump operations because the skb is always
* properly sized for single flows.
*/
start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
if (start) {
const struct sw_flow_actions *sf_acts;
sf_acts = rcu_dereference_ovsl(flow->sf_acts);
err = ovs_nla_put_actions(sf_acts->actions,
sf_acts->actions_len, skb);
if (!err)
nla_nest_end(skb, start);
else {
if (skb_orig_len)
return err;
nla_nest_cancel(skb, start);
}
} else if (skb_orig_len) {
return -EMSGSIZE;
}
return 0;
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
struct sk_buff *skb, u32 portid,
u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
{
const int skb_orig_len = skb->len;
struct ovs_header *ovs_header;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
flags, cmd);
if (!ovs_header)
return -EMSGSIZE;
ovs_header->dp_ifindex = dp_ifindex;
err = ovs_nla_put_identifier(flow, skb);
if (err)
goto error;
if (should_fill_key(&flow->id, ufid_flags)) {
err = ovs_nla_put_masked_key(flow, skb);
if (err)
goto error;
}
if (should_fill_mask(ufid_flags)) {
err = ovs_nla_put_mask(flow, skb);
if (err)
goto error;
}
err = ovs_flow_cmd_fill_stats(flow, skb);
if (err)
goto error;
if (should_fill_actions(ufid_flags)) {
err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
if (err)
goto error;
}
genlmsg_end(skb, ovs_header);
return 0;
error:
genlmsg_cancel(skb, ovs_header);
return err;
}
/* May not be called with RCU read lock. */
static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
const struct sw_flow_id *sfid,
struct genl_info *info,
bool always,
uint32_t ufid_flags)
{
struct sk_buff *skb;
size_t len;
if (!always && !ovs_must_notify(&dp_flow_genl_family, info,
GROUP_ID(&ovs_dp_flow_multicast_group)))
return NULL;
len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
skb = genlmsg_new_unicast(len, info, GFP_KERNEL);
if (!skb)
return ERR_PTR(-ENOMEM);
return skb;
}
/* Called with ovs_mutex. */
static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
int dp_ifindex,
struct genl_info *info, u8 cmd,
bool always, u32 ufid_flags)
{
struct sk_buff *skb;
int retval;
skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts),
&flow->id, info, always, ufid_flags);
if (IS_ERR_OR_NULL(skb))
return skb;
retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
info->snd_portid, info->snd_seq, 0,
cmd, ufid_flags);
BUG_ON(retval < 0);
return skb;
}
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
struct net *net = sock_net(skb->sk);
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow *flow = NULL, *new_flow;
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
struct sw_flow_key key;
struct sw_flow_actions *acts;
struct sw_flow_match match;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
int error;
bool log = !a[OVS_FLOW_ATTR_PROBE];
/* Must have key and actions. */
error = -EINVAL;
if (!a[OVS_FLOW_ATTR_KEY]) {
OVS_NLERR(log, "Flow key attr not present in new flow.");
goto error;
}
if (!a[OVS_FLOW_ATTR_ACTIONS]) {
OVS_NLERR(log, "Flow actions attr not present in new flow.");
goto error;
}
/* Most of the time we need to allocate a new flow, do it before
* locking.
*/
new_flow = ovs_flow_alloc();
if (IS_ERR(new_flow)) {
error = PTR_ERR(new_flow);
goto error;
}
/* Extract key. */
ovs_match_init(&match, &key, &mask);
error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto err_kfree_flow;
datapath: Backport "openvswitch: Zero flows on allocation." Upstream commit: openvswitch: Zero flows on allocation. When support for megaflows was introduced, OVS needed to start installing flows with a mask applied to them. Since masking is an expensive operation, OVS also had an optimization that would only take the parts of the flow keys that were covered by a non-zero mask. The values stored in the remaining pieces should not matter because they are masked out. While this works fine for the purposes of matching (which must always look at the mask), serialization to netlink can be problematic. Since the flow and the mask are serialized separately, the uninitialized portions of the flow can be encoded with whatever values happen to be present. In terms of functionality, this has little effect since these fields will be masked out by definition. However, it leaks kernel memory to userspace, which is a potential security vulnerability. It is also possible that other code paths could look at the masked key and get uninitialized data, although this does not currently appear to be an issue in practice. This removes the mask optimization for flows that are being installed. This was always intended to be the case as the mask optimizations were really targetting per-packet flow operations. Fixes: 03f0d916 ("openvswitch: Mega flow implementation") Signed-off-by: Jesse Gross <jesse@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net> Upstream: ae5f2fb1 ("openvswitch: Zero flows on allocation.") Signed-off-by: Jesse Gross <jesse@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-09-22 18:13:00 -07:00
ovs_flow_mask_key(&new_flow->key, &key, true, &mask);
/* Extract flow identifier. */
error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
&key, log);
if (error)
goto err_kfree_flow;
/* Validate actions. */
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
&new_flow->key, &acts, log);
if (error) {
OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
goto err_kfree_flow;
}
reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false,
ufid_flags);
if (IS_ERR(reply)) {
error = PTR_ERR(reply);
goto err_kfree_acts;
}
ovs_lock();
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
dp = get_dp(net, ovs_header->dp_ifindex);
if (unlikely(!dp)) {
error = -ENODEV;
goto err_unlock_ovs;
}
/* Check if this is a duplicate flow */
if (ovs_identifier_is_ufid(&new_flow->id))
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
if (!flow)
flow = ovs_flow_tbl_lookup(&dp->table, &key);
if (likely(!flow)) {
rcu_assign_pointer(new_flow->sf_acts, acts);
/* Put flow in bucket. */
error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
if (unlikely(error)) {
acts = NULL;
goto err_unlock_ovs;
}
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(new_flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW,
ufid_flags);
BUG_ON(error < 0);
}
ovs_unlock();
} else {
struct sw_flow_actions *old_acts;
/* Bail out if we're not allowed to modify an existing flow.
* We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
* because Generic Netlink treats the latter as a dump
* request. We also accept NLM_F_EXCL in case that bug ever
* gets fixed.
*/
if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
| NLM_F_EXCL))) {
error = -EEXIST;
goto err_unlock_ovs;
}
/* The flow identifier has to be the same for flow updates.
* Look for any overlapping flow.
*/
if (unlikely(!ovs_flow_cmp(flow, &match))) {
if (ovs_identifier_is_key(&flow->id))
flow = ovs_flow_tbl_lookup_exact(&dp->table,
&match);
else /* UFID matches but key is different */
flow = NULL;
if (!flow) {
error = -ENOENT;
goto err_unlock_ovs;
}
}
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW,
ufid_flags);
BUG_ON(error < 0);
}
ovs_unlock();
ovs_nla_free_flow_actions_rcu(old_acts);
ovs_flow_free(new_flow, false);
}
if (reply)
ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
return 0;
err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
ovs_nla_free_flow_actions(acts);
err_kfree_flow:
ovs_flow_free(new_flow, false);
error:
return error;
}
/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
static struct sw_flow_actions *get_flow_actions(struct net *net,
const struct nlattr *a,
const struct sw_flow_key *key,
const struct sw_flow_mask *mask,
bool log)
{
struct sw_flow_actions *acts;
struct sw_flow_key masked_key;
int error;
datapath: Backport "openvswitch: Zero flows on allocation." Upstream commit: openvswitch: Zero flows on allocation. When support for megaflows was introduced, OVS needed to start installing flows with a mask applied to them. Since masking is an expensive operation, OVS also had an optimization that would only take the parts of the flow keys that were covered by a non-zero mask. The values stored in the remaining pieces should not matter because they are masked out. While this works fine for the purposes of matching (which must always look at the mask), serialization to netlink can be problematic. Since the flow and the mask are serialized separately, the uninitialized portions of the flow can be encoded with whatever values happen to be present. In terms of functionality, this has little effect since these fields will be masked out by definition. However, it leaks kernel memory to userspace, which is a potential security vulnerability. It is also possible that other code paths could look at the masked key and get uninitialized data, although this does not currently appear to be an issue in practice. This removes the mask optimization for flows that are being installed. This was always intended to be the case as the mask optimizations were really targetting per-packet flow operations. Fixes: 03f0d916 ("openvswitch: Mega flow implementation") Signed-off-by: Jesse Gross <jesse@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net> Upstream: ae5f2fb1 ("openvswitch: Zero flows on allocation.") Signed-off-by: Jesse Gross <jesse@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-09-22 18:13:00 -07:00
ovs_flow_mask_key(&masked_key, key, true, mask);
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log);
if (error) {
OVS_NLERR(log,
"Actions may not be safe on all matching packets");
return ERR_PTR(error);
}
return acts;
}
static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
struct net *net = sock_net(skb->sk);
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
struct sw_flow_mask mask;
struct sk_buff *reply = NULL;
struct datapath *dp;
struct sw_flow_actions *old_acts = NULL, *acts = NULL;
struct sw_flow_match match;
struct sw_flow_id sfid;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
int error;
bool log = !a[OVS_FLOW_ATTR_PROBE];
bool ufid_present;
/* Extract key. */
error = -EINVAL;
if (!a[OVS_FLOW_ATTR_KEY]) {
OVS_NLERR(log, "Flow key attribute not present in set flow.");
goto error;
}
ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
ovs_match_init(&match, &key, &mask);
error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto error;
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
&mask, log);
if (IS_ERR(acts)) {
error = PTR_ERR(acts);
goto error;
}
/* Can allocate before locking if have acts. */
reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false,
ufid_flags);
if (IS_ERR(reply)) {
error = PTR_ERR(reply);
goto err_kfree_acts;
}
}
ovs_lock();
datapath: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Upstream: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Justin Pettit <jpettit@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-12-02 23:53:50 -08:00
dp = get_dp(net, ovs_header->dp_ifindex);
if (unlikely(!dp)) {
error = -ENODEV;
goto err_unlock_ovs;
}
/* Check that the flow exists. */
if (ufid_present)
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid);
else
flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
if (unlikely(!flow)) {
error = -ENOENT;
goto err_unlock_ovs;
}
/* Update actions, if present. */
if (likely(acts)) {
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW,
ufid_flags);
BUG_ON(error < 0);
}
} else {
/* Could not alloc without acts before locking. */
reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
info, OVS_FLOW_CMD_NEW, false,
ufid_flags);
if (unlikely(IS_ERR(reply))) {
error = PTR_ERR(reply);
goto err_unlock_ovs;
}
}
/* Clear stats. */
if (a[OVS_FLOW_ATTR_CLEAR])
ovs_flow_stats_clear(flow);
ovs_unlock();
if (reply)
ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
if (old_acts)
ovs_nla_free_flow_actions_rcu(old_acts);
return 0;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
ovs_nla_free_flow_actions(acts);
error:
return error;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
}
static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sk_buff *reply;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
struct sw_flow *flow;
struct datapath *dp;
struct sw_flow_match match;
struct sw_flow_id ufid;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
int err = 0;
bool log = !a[OVS_FLOW_ATTR_PROBE];
bool ufid_present;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
ovs_match_init(&match, &key, NULL);
err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL,
log);
} else if (!ufid_present) {
OVS_NLERR(log,
"Flow get message rejected, Key attribute missing.");
err = -EINVAL;
}
if (err)
return err;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
if (!dp) {
err = -ENODEV;
goto unlock;
}
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
if (ufid_present)
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
else
flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
if (!flow) {
err = -ENOENT;
goto unlock;
}
reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
OVS_FLOW_CMD_NEW, true, ufid_flags);
if (IS_ERR(reply)) {
err = PTR_ERR(reply);
goto unlock;
}
ovs_unlock();
return genlmsg_reply(reply, info);
unlock:
ovs_unlock();
return err;
}
static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sk_buff *reply;
struct sw_flow *flow = NULL;
struct datapath *dp;
struct sw_flow_match match;
struct sw_flow_id ufid;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
int err;
bool log = !a[OVS_FLOW_ATTR_PROBE];
bool ufid_present;
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
ovs_match_init(&match, &key, NULL);
err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL,
log);
if (unlikely(err))
return err;
}
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
if (unlikely(!dp)) {
err = -ENODEV;
goto unlock;
}
if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
err = ovs_flow_tbl_flush(&dp->table);
goto unlock;
}
if (ufid_present)
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
else
flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
if (unlikely(!flow)) {
err = -ENOENT;
goto unlock;
}
ovs_flow_tbl_remove(&dp->table, flow);
ovs_unlock();
reply = ovs_flow_cmd_alloc_info(rcu_dereference_raw(flow->sf_acts),
&flow->id, info, false, ufid_flags);
if (likely(reply)) {
if (likely(!IS_ERR(reply))) {
rcu_read_lock(); /*To keep RCU checker happy. */
err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_DEL,
ufid_flags);
rcu_read_unlock();
BUG_ON(err < 0);
ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
} else {
genl_set_err(&dp_flow_genl_family, sock_net(skb->sk), 0,
GROUP_ID(&ovs_dp_flow_multicast_group), PTR_ERR(reply));
}
}
ovs_flow_free(flow, true);
return 0;
unlock:
ovs_unlock();
return err;
}
static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nlattr *a[__OVS_FLOW_ATTR_MAX];
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
struct table_instance *ti;
struct datapath *dp;
u32 ufid_flags;
int err;
err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a,
OVS_FLOW_ATTR_MAX, flow_policy);
if (err)
return err;
ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
rcu_read_lock();
dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
if (!dp) {
rcu_read_unlock();
return -ENODEV;
}
ti = rcu_dereference(dp->table.ti);
for (;;) {
struct sw_flow *flow;
u32 bucket, obj;
bucket = cb->args[0];
obj = cb->args[1];
flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
if (!flow)
break;
if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
OVS_FLOW_CMD_NEW, ufid_flags) < 0)
break;
cb->args[0] = bucket;
cb->args[1] = obj;
}
rcu_read_unlock();
return skb->len;
datapath: Change listing flows to use an iterator concept. One of the goals for Open vSwitch is to decouple kernel and userspace software, so that either one can be upgraded or rolled back independent of the other. To do this in full generality, it must be possible to change the kernel's idea of the flow key separately from the userspace version. In turn, that means that flow keys must become variable-length. This does not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form, because that would require userspace to know how much space to allocate for each flow's key in advance, or to allocate as much space as could possibly be needed. Neither choice is very attractive. This commit prepares for a different solution, by replacing ODP_FLOW_LIST by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath on each call. It is much cleaner to allocate the maximum amount of space for a single flow key than to do so for possibly a very large number of flow keys. As a side effect, this patch also fixes a race condition that sometimes made "ovs-dpctl dump-flows" print an error: previously, flows were listed and then their actions were retrieved, which left a window in which ovs-vswitchd could delete the flow. Now dumping a flow and its actions is a single step, closing that window. Dumping all of the flows in a datapath is no longer an atomic step, so now it is possible to miss some flows or see a single flow twice during iteration, if the flow table is modified by another process. It doesn't look like this should be a problem for ovs-vswitchd. It would be faster to retrieve a number of flows in batch instead of just one at a time, but that will naturally happen later when the kernel datapath interface is changed to use Netlink, so this patch does not bother with it. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jesse Gross <jesse@nicira.com>
2010-12-28 10:39:52 -08:00
}
static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
[OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG },
[OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 },
[OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 },
};
static struct genl_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_new
},
{ .cmd = OVS_FLOW_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_del
},
{ .cmd = OVS_FLOW_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = flow_policy,
.doit = ovs_flow_cmd_get,
.dumpit = ovs_flow_cmd_dump
},
{ .cmd = OVS_FLOW_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_set,
},
};
static struct genl_family dp_flow_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_FLOW_FAMILY,
.version = OVS_FLOW_VERSION,
.maxattr = OVS_FLOW_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_flow_genl_ops,
.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
};
static size_t ovs_dp_cmd_msg_size(void)
{
size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
msgsize += nla_total_size(IFNAMSIZ);
msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
return msgsize;
}
/* Called with ovs_mutex. */
static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
u32 portid, u32 seq, u32 flags, u8 cmd)
{
struct ovs_header *ovs_header;
struct ovs_dp_stats dp_stats;
struct ovs_dp_megaflow_stats dp_megaflow_stats;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
flags, cmd);
if (!ovs_header)
goto error;
ovs_header->dp_ifindex = get_dpifindex(dp);
err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
if (err)
goto nla_put_failure;
get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
&dp_stats))
goto nla_put_failure;
if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
sizeof(struct ovs_dp_megaflow_stats),
&dp_megaflow_stats))
goto nla_put_failure;
if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
goto nla_put_failure;
genlmsg_end(skb, ovs_header);
return 0;
nla_put_failure:
genlmsg_cancel(skb, ovs_header);
error:
return -EMSGSIZE;
}
static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
{
return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL);
}
/* Called with rcu_read_lock or ovs_mutex. */
static struct datapath *lookup_datapath(struct net *net,
const struct ovs_header *ovs_header,
struct nlattr *a[OVS_DP_ATTR_MAX + 1])
{
struct datapath *dp;
if (!a[OVS_DP_ATTR_NAME])
dp = get_dp(net, ovs_header->dp_ifindex);
else {
struct vport *vport;
vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
}
return dp ? dp : ERR_PTR(-ENODEV);
}
static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
{
struct datapath *dp;
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
if (IS_ERR(dp))
return;
WARN(dp->user_features, "Dropping previously announced user features\n");
dp->user_features = 0;
}
static void ovs_dp_change(struct datapath *dp, struct nlattr *a[])
{
if (a[OVS_DP_ATTR_USER_FEATURES])
dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
}
static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct vport_parms parms;
struct sk_buff *reply;
struct datapath *dp;
struct vport *vport;
struct ovs_net *ovs_net;
int err, i;
err = -EINVAL;
if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
goto err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
err = -ENOMEM;
dp = kzalloc(sizeof(*dp), GFP_KERNEL);
if (dp == NULL)
goto err_free_reply;
ovs_dp_set_net(dp, sock_net(skb->sk));
/* Allocate table. */
err = ovs_flow_tbl_init(&dp->table);
if (err)
goto err_free_dp;
dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
if (!dp->stats_percpu) {
err = -ENOMEM;
goto err_destroy_table;
}
dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
GFP_KERNEL);
if (!dp->ports) {
err = -ENOMEM;
goto err_destroy_percpu;
}
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(&dp->ports[i]);
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
parms.options = NULL;
parms.dp = dp;
parms.port_no = OVSP_LOCAL;
parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
ovs_dp_change(dp, a);
/* So far only local changes have been made, now need the lock. */
ovs_lock();
vport = new_vport(&parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
if (err == -EBUSY)
err = -EEXIST;
if (err == -EEXIST) {
/* An outdated user space instance that does not understand
* the concept of user_features has attempted to create a new
* datapath and is likely to reuse it. Drop all user features.
*/
if (info->genlhdr->version < OVS_DP_VER_FEATURES)
ovs_dp_reset_user_features(skb, info);
}
goto err_destroy_ports_array;
}
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_NEW);
BUG_ON(err < 0);
ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
ovs_unlock();
ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
return 0;
err_destroy_ports_array:
ovs_unlock();
kfree(dp->ports);
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
ovs_flow_tbl_destroy(&dp->table);
err_free_dp:
kfree(dp);
err_free_reply:
kfree_skb(reply);
err:
return err;
}
/* Called with ovs_mutex. */
static void __dp_destroy(struct datapath *dp)
{
int i;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
struct hlist_node *n;
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
if (vport->port_no != OVSP_LOCAL)
ovs_dp_detach_port(vport);
}
list_del_rcu(&dp->list_node);
/* OVSP_LOCAL is datapath internal port. We need to make sure that
* all ports in datapath are destroyed first before freeing datapath.
*/
ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
/* RCU destroy the flow table */
call_rcu(&dp->rcu, destroy_dp_rcu);
}
static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *reply;
struct datapath *dp;
int err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
ovs_lock();
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
err = PTR_ERR(dp);
if (IS_ERR(dp))
goto err_unlock_free;
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_DEL);
BUG_ON(err < 0);
__dp_destroy(dp);
ovs_unlock();
ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
return 0;
err_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *reply;
struct datapath *dp;
int err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
ovs_lock();
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
err = PTR_ERR(dp);
if (IS_ERR(dp))
goto err_unlock_free;
ovs_dp_change(dp, info->attrs);
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_NEW);
BUG_ON(err < 0);
ovs_unlock();
ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
return 0;
err_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *reply;
struct datapath *dp;
int err;
reply = ovs_dp_cmd_alloc_info(info);
if (!reply)
return -ENOMEM;
ovs_lock();
dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
if (IS_ERR(dp)) {
err = PTR_ERR(dp);
goto err_unlock_free;
}
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_NEW);
BUG_ON(err < 0);
ovs_unlock();
return genlmsg_reply(reply, info);
err_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
struct datapath *dp;
int skip = cb->args[0];
int i = 0;
ovs_lock();
list_for_each_entry(dp, &ovs_net->dps, list_node) {
if (i >= skip &&
ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
OVS_DP_CMD_NEW) < 0)
break;
i++;
}
ovs_unlock();
cb->args[0] = i;
return skb->len;
}
static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
[OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
};
static struct genl_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_new
},
{ .cmd = OVS_DP_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_del
},
{ .cmd = OVS_DP_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_get,
.dumpit = ovs_dp_cmd_dump
},
{ .cmd = OVS_DP_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_set,
},
};
static struct genl_family dp_datapath_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
.maxattr = OVS_DP_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_datapath_genl_ops,
.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
};
/* Called with ovs_mutex or RCU read lock. */
static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
u32 portid, u32 seq, u32 flags, u8 cmd)
{
struct ovs_header *ovs_header;
struct ovs_vport_stats vport_stats;
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
flags, cmd);
if (!ovs_header)
return -EMSGSIZE;
ovs_header->dp_ifindex = get_dpifindex(vport->dp);
if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
nla_put_string(skb, OVS_VPORT_ATTR_NAME,
ovs_vport_name(vport)))
goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
&vport_stats))
goto nla_put_failure;
if (ovs_vport_get_upcall_portids(vport, skb))
goto nla_put_failure;
err = ovs_vport_get_options(vport, skb);
if (err == -EMSGSIZE)
goto error;
genlmsg_end(skb, ovs_header);
return 0;
nla_put_failure:
err = -EMSGSIZE;
error:
genlmsg_cancel(skb, ovs_header);
return err;
}
static struct sk_buff *ovs_vport_cmd_alloc_info(void)
{
return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
}
/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
u32 seq, u8 cmd)
{
struct sk_buff *skb;
int retval;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb)
return ERR_PTR(-ENOMEM);
retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
BUG_ON(retval < 0);
return skb;
}
/* Called with ovs_mutex or RCU read lock. */
static struct vport *lookup_vport(struct net *net,
const struct ovs_header *ovs_header,
struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
{
struct datapath *dp;
struct vport *vport;
if (a[OVS_VPORT_ATTR_NAME]) {
vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
if (!vport)
return ERR_PTR(-ENODEV);
if (ovs_header->dp_ifindex &&
ovs_header->dp_ifindex != get_dpifindex(vport->dp))
return ERR_PTR(-ENODEV);
return vport;
} else if (a[OVS_VPORT_ATTR_PORT_NO]) {
u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
if (port_no >= DP_MAX_PORTS)
return ERR_PTR(-EFBIG);
dp = get_dp(net, ovs_header->dp_ifindex);
if (!dp)
return ERR_PTR(-ENODEV);
vport = ovs_vport_ovsl_rcu(dp, port_no);
if (!vport)
return ERR_PTR(-ENODEV);
return vport;
} else
return ERR_PTR(-EINVAL);
}
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct vport_parms parms;
struct sk_buff *reply;
struct vport *vport;
struct datapath *dp;
2011-01-10 13:12:12 -08:00
u32 port_no;
int err;
2011-01-10 13:12:12 -08:00
if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
!a[OVS_VPORT_ATTR_UPCALL_PID])
return -EINVAL;
port_no = a[OVS_VPORT_ATTR_PORT_NO]
? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
if (port_no >= DP_MAX_PORTS)
return -EFBIG;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
ovs_lock();
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
restart:
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
err = -ENODEV;
if (!dp)
goto exit_unlock_free;
if (port_no) {
vport = ovs_vport_ovsl(dp, port_no);
err = -EBUSY;
if (vport)
goto exit_unlock_free;
} else {
for (port_no = 1; ; port_no++) {
if (port_no >= DP_MAX_PORTS) {
err = -EFBIG;
goto exit_unlock_free;
}
vport = ovs_vport_ovsl(dp, port_no);
if (!vport)
break;
}
}
2011-01-10 13:12:12 -08:00
parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
parms.options = a[OVS_VPORT_ATTR_OPTIONS];
parms.dp = dp;
parms.port_no = port_no;
parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];
vport = new_vport(&parms);
err = PTR_ERR(vport);
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
if (IS_ERR(vport)) {
if (err == -EAGAIN)
goto restart;
goto exit_unlock_free;
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
}
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
BUG_ON(err < 0);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
return 0;
exit_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct sk_buff *reply;
struct vport *vport;
int err;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
ovs_lock();
vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock_free;
if (a[OVS_VPORT_ATTR_TYPE] &&
nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
err = -EINVAL;
goto exit_unlock_free;
}
if (a[OVS_VPORT_ATTR_OPTIONS]) {
err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
if (err)
goto exit_unlock_free;
}
if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID];
err = ovs_vport_set_upcall_portids(vport, ids);
if (err)
goto exit_unlock_free;
}
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
BUG_ON(err < 0);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
return 0;
exit_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct sk_buff *reply;
struct vport *vport;
int err;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
ovs_lock();
vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock_free;
if (vport->port_no == OVSP_LOCAL) {
err = -EINVAL;
goto exit_unlock_free;
}
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_DEL);
BUG_ON(err < 0);
ovs_dp_detach_port(vport);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
return 0;
exit_unlock_free:
ovs_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sk_buff *reply;
struct vport *vport;
int err;
reply = ovs_vport_cmd_alloc_info();
if (!reply)
return -ENOMEM;
rcu_read_lock();
vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock_free;
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
BUG_ON(err < 0);
rcu_read_unlock();
return genlmsg_reply(reply, info);
exit_unlock_free:
rcu_read_unlock();
kfree_skb(reply);
return err;
}
static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
struct datapath *dp;
int bucket = cb->args[0], skip = cb->args[1];
int i, j = 0;
rcu_read_lock();
dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
if (!dp) {
rcu_read_unlock();
return -ENODEV;
}
for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
j = 0;
datapath: hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Jesse Gross <jesse@nicira.com>
2013-03-14 18:40:32 -07:00
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
if (j >= skip &&
ovs_vport_cmd_fill_info(vport, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
OVS_VPORT_CMD_NEW) < 0)
goto out;
j++;
}
skip = 0;
}
out:
rcu_read_unlock();
cb->args[0] = i;
cb->args[1] = j;
return skb->len;
}
static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
[OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
};
static struct genl_ops dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_new
},
{ .cmd = OVS_VPORT_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_del
},
{ .cmd = OVS_VPORT_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = vport_policy,
.doit = ovs_vport_cmd_get,
.dumpit = ovs_vport_cmd_dump
},
{ .cmd = OVS_VPORT_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_set,
},
};
struct genl_family dp_vport_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_VPORT_FAMILY,
.version = OVS_VPORT_VERSION,
.maxattr = OVS_VPORT_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_vport_genl_ops,
.n_ops = ARRAY_SIZE(dp_vport_genl_ops),
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
};
static struct genl_family *dp_genl_families[] = {
&dp_datapath_genl_family,
&dp_vport_genl_family,
&dp_flow_genl_family,
&dp_packet_genl_family,
};
static void dp_unregister_genl(int n_families)
{
int i;
for (i = 0; i < n_families; i++)
genl_unregister_family(dp_genl_families[i]);
}
static int dp_register_genl(void)
{
int err;
int i;
for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
err = genl_register_family(dp_genl_families[i]);
if (err)
goto error;
}
return 0;
error:
dp_unregister_genl(i);
return err;
}
static int __net_init ovs_init_net(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
return 0;
}
datapath: Fix net exit. Open vSwitch allows moving internal vport to different namespace while still connected to the bridge. But when namespace deleted OVS does not detach these vports, that results in dangling pointer to netdevice which causes kernel panic as follows. This issue is fixed by detaching all ovs ports from the deleted namespace at net-exit. BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [<ffffffffa0aadaa5>] ovs_vport_locate+0x35/0x80 [openvswitch] Oops: 0000 [#1] SMP Call Trace: [<ffffffffa0aa6391>] lookup_vport+0x21/0xd0 [openvswitch] [<ffffffffa0aa65f9>] ovs_vport_cmd_get+0x59/0xf0 [openvswitch] [<ffffffff8167e07c>] genl_family_rcv_msg+0x1bc/0x3e0 [<ffffffff8167e319>] genl_rcv_msg+0x79/0xc0 [<ffffffff8167d919>] netlink_rcv_skb+0xb9/0xe0 [<ffffffff8167deac>] genl_rcv+0x2c/0x40 [<ffffffff8167cffd>] netlink_unicast+0x12d/0x1c0 [<ffffffff8167d3da>] netlink_sendmsg+0x34a/0x6b0 [<ffffffff8162e140>] sock_sendmsg+0xa0/0xe0 [<ffffffff8162e5e8>] ___sys_sendmsg+0x408/0x420 [<ffffffff8162f541>] __sys_sendmsg+0x51/0x90 [<ffffffff8162f592>] SyS_sendmsg+0x12/0x20 [<ffffffff81764ee9>] system_call_fastpath+0x12/0x17 Reported-by: Assaf Muller <amuller@redhat.com> Fixes: 46df7b81454("openvswitch: Add support for network namespaces.") Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Reviewed-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> Upstream: 7b4577a9da ("openvswitch: Fix net exit"). Acked-by: Andy Zhou <azhou@nicira.com>
2015-02-20 14:59:47 -08:00
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
struct list_head *head)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
datapath: Fix net exit. Open vSwitch allows moving internal vport to different namespace while still connected to the bridge. But when namespace deleted OVS does not detach these vports, that results in dangling pointer to netdevice which causes kernel panic as follows. This issue is fixed by detaching all ovs ports from the deleted namespace at net-exit. BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [<ffffffffa0aadaa5>] ovs_vport_locate+0x35/0x80 [openvswitch] Oops: 0000 [#1] SMP Call Trace: [<ffffffffa0aa6391>] lookup_vport+0x21/0xd0 [openvswitch] [<ffffffffa0aa65f9>] ovs_vport_cmd_get+0x59/0xf0 [openvswitch] [<ffffffff8167e07c>] genl_family_rcv_msg+0x1bc/0x3e0 [<ffffffff8167e319>] genl_rcv_msg+0x79/0xc0 [<ffffffff8167d919>] netlink_rcv_skb+0xb9/0xe0 [<ffffffff8167deac>] genl_rcv+0x2c/0x40 [<ffffffff8167cffd>] netlink_unicast+0x12d/0x1c0 [<ffffffff8167d3da>] netlink_sendmsg+0x34a/0x6b0 [<ffffffff8162e140>] sock_sendmsg+0xa0/0xe0 [<ffffffff8162e5e8>] ___sys_sendmsg+0x408/0x420 [<ffffffff8162f541>] __sys_sendmsg+0x51/0x90 [<ffffffff8162f592>] SyS_sendmsg+0x12/0x20 [<ffffffff81764ee9>] system_call_fastpath+0x12/0x17 Reported-by: Assaf Muller <amuller@redhat.com> Fixes: 46df7b81454("openvswitch: Add support for network namespaces.") Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Reviewed-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> Upstream: 7b4577a9da ("openvswitch: Fix net exit"). Acked-by: Andy Zhou <azhou@nicira.com>
2015-02-20 14:59:47 -08:00
struct datapath *dp;
list_for_each_entry(dp, &ovs_net->dps, list_node) {
int i;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
continue;
if (dev_net(vport->dev) == dnet)
datapath: Fix net exit. Open vSwitch allows moving internal vport to different namespace while still connected to the bridge. But when namespace deleted OVS does not detach these vports, that results in dangling pointer to netdevice which causes kernel panic as follows. This issue is fixed by detaching all ovs ports from the deleted namespace at net-exit. BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [<ffffffffa0aadaa5>] ovs_vport_locate+0x35/0x80 [openvswitch] Oops: 0000 [#1] SMP Call Trace: [<ffffffffa0aa6391>] lookup_vport+0x21/0xd0 [openvswitch] [<ffffffffa0aa65f9>] ovs_vport_cmd_get+0x59/0xf0 [openvswitch] [<ffffffff8167e07c>] genl_family_rcv_msg+0x1bc/0x3e0 [<ffffffff8167e319>] genl_rcv_msg+0x79/0xc0 [<ffffffff8167d919>] netlink_rcv_skb+0xb9/0xe0 [<ffffffff8167deac>] genl_rcv+0x2c/0x40 [<ffffffff8167cffd>] netlink_unicast+0x12d/0x1c0 [<ffffffff8167d3da>] netlink_sendmsg+0x34a/0x6b0 [<ffffffff8162e140>] sock_sendmsg+0xa0/0xe0 [<ffffffff8162e5e8>] ___sys_sendmsg+0x408/0x420 [<ffffffff8162f541>] __sys_sendmsg+0x51/0x90 [<ffffffff8162f592>] SyS_sendmsg+0x12/0x20 [<ffffffff81764ee9>] system_call_fastpath+0x12/0x17 Reported-by: Assaf Muller <amuller@redhat.com> Fixes: 46df7b81454("openvswitch: Add support for network namespaces.") Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Reviewed-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> Upstream: 7b4577a9da ("openvswitch: Fix net exit"). Acked-by: Andy Zhou <azhou@nicira.com>
2015-02-20 14:59:47 -08:00
list_add(&vport->detach_list, head);
}
}
}
}
static void __net_exit ovs_exit_net(struct net *dnet)
{
struct datapath *dp, *dp_next;
struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id);
struct vport *vport, *vport_next;
struct net *net;
LIST_HEAD(head);
ovs_lock();
list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
__dp_destroy(dp);
datapath: Fix net exit. Open vSwitch allows moving internal vport to different namespace while still connected to the bridge. But when namespace deleted OVS does not detach these vports, that results in dangling pointer to netdevice which causes kernel panic as follows. This issue is fixed by detaching all ovs ports from the deleted namespace at net-exit. BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [<ffffffffa0aadaa5>] ovs_vport_locate+0x35/0x80 [openvswitch] Oops: 0000 [#1] SMP Call Trace: [<ffffffffa0aa6391>] lookup_vport+0x21/0xd0 [openvswitch] [<ffffffffa0aa65f9>] ovs_vport_cmd_get+0x59/0xf0 [openvswitch] [<ffffffff8167e07c>] genl_family_rcv_msg+0x1bc/0x3e0 [<ffffffff8167e319>] genl_rcv_msg+0x79/0xc0 [<ffffffff8167d919>] netlink_rcv_skb+0xb9/0xe0 [<ffffffff8167deac>] genl_rcv+0x2c/0x40 [<ffffffff8167cffd>] netlink_unicast+0x12d/0x1c0 [<ffffffff8167d3da>] netlink_sendmsg+0x34a/0x6b0 [<ffffffff8162e140>] sock_sendmsg+0xa0/0xe0 [<ffffffff8162e5e8>] ___sys_sendmsg+0x408/0x420 [<ffffffff8162f541>] __sys_sendmsg+0x51/0x90 [<ffffffff8162f592>] SyS_sendmsg+0x12/0x20 [<ffffffff81764ee9>] system_call_fastpath+0x12/0x17 Reported-by: Assaf Muller <amuller@redhat.com> Fixes: 46df7b81454("openvswitch: Add support for network namespaces.") Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Reviewed-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> Upstream: 7b4577a9da ("openvswitch: Fix net exit"). Acked-by: Andy Zhou <azhou@nicira.com>
2015-02-20 14:59:47 -08:00
rtnl_lock();
for_each_net(net)
list_vports_from_net(net, dnet, &head);
rtnl_unlock();
/* Detach all vports from given namespace. */
list_for_each_entry_safe(vport, vport_next, &head, detach_list) {
list_del(&vport->detach_list);
ovs_dp_detach_port(vport);
}
ovs_unlock();
cancel_work_sync(&ovs_net->dp_notify_work);
}
static struct pernet_operations ovs_net_ops = {
.init = ovs_init_net,
.exit = ovs_exit_net,
.id = &ovs_net_id,
.size = sizeof(struct ovs_net),
};
DEFINE_COMPAT_PNET_REG_FUNC(device);
static int __init dp_init(void)
{
int err;
BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath %s\n", VERSION);
err = compat_init();
if (err)
goto error;
err = action_fifos_init();
if (err)
goto error_compat_exit;
err = ovs_internal_dev_rtnl_link_register();
if (err)
goto error_action_fifos_exit;
err = ovs_flow_init();
if (err)
goto error_unreg_rtnl_link;
err = ovs_vport_init();
if (err)
goto error_flow_exit;
err = register_pernet_device(&ovs_net_ops);
if (err)
goto error_vport_exit;
err = register_netdevice_notifier(&ovs_dp_device_notifier);
if (err)
goto error_netns_exit;
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
err = ovs_netdev_init();
if (err)
goto error_unreg_notifier;
err = dp_register_genl();
if (err < 0)
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
goto error_unreg_netdev;
return 0;
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
error_unreg_netdev:
ovs_netdev_exit();
error_unreg_notifier:
unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_netns_exit:
unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
ovs_vport_exit();
error_flow_exit:
ovs_flow_exit();
error_unreg_rtnl_link:
ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:
action_fifos_exit();
error_compat_exit:
compat_exit();
error:
return err;
}
static void dp_cleanup(void)
{
dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
datapath: Turn vports with dependencies into separate modules Upstream commit: The internal and netdev vport remain part of openvswitch.ko. Encap vports including vxlan, gre, and geneve can be built as separate modules and are loaded on demand. Modules can be unloaded after use. Datapath ports keep a reference to the vport module during their lifetime. Allows to remove the error prone maintenance of the global list vport_ops_list. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> Also folds in the follow-up commits 9ba559d9ca3 to turned the non-GPL symbol exports to GPL exports, and fa2d8ff4e35 which fixes a module reference release bug. Exports various backwards compat functions linked into the main openvswitch module as GPL symbols to ensure vport modules can use them. Some fiddling with the Makefile was needed to work around the fact that Makefile variables can't contain '-' characters needed to define 'vport-xxx' module sources. Also, Kbuild complains heavily if a $(module)-y = $(module).o is defined which is actually backed with a .c file of the same name. Therefore, a new $(build_multi_modules) variable is defined which lists all module which consist of more than one source file. Upstream: 62b9c8d0372 ("ovs: Turn vports with dependencies into separate modules") Upstream: 9ba559d9ca3 ("openvswitch: Export symbols as GPL symbols.") Upstream: fa2d8ff4e35 ("openvswitch: Return vport module ref before destruction") Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Acked-by: Pravin B Shelar <pshelar@nicira.com>
2015-04-04 08:24:13 +02:00
ovs_netdev_exit();
unregister_netdevice_notifier(&ovs_dp_device_notifier);
unregister_pernet_device(&ovs_net_ops);
rcu_barrier();
ovs_vport_exit();
ovs_flow_exit();
ovs_internal_dev_rtnl_link_unregister();
action_fifos_exit();
compat_exit();
}
module_init(dp_init);
module_exit(dp_cleanup);
MODULE_DESCRIPTION("Open vSwitch switching datapath");
MODULE_LICENSE("GPL");
MODULE_VERSION(VERSION);