/*
 * Copyright (c) 2009, 2010 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */

/* Interface exported by openvswitch_mod. */

#ifndef DATAPATH_H
#define DATAPATH_H 1

#include <asm/page.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/version.h>
#include "flow.h"
#include "dp_sysfs.h"

/* Mask for the priority bits in a vlan header.  If we ever merge upstream
 * then this should go into include/linux/if_vlan.h. */
#define VLAN_PCP_MASK 0xe000
#define VLAN_PCP_SHIFT 13
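
/* Illustrative sketch (hypothetical helper, not part of this interface):
 * given a VLAN TCI value in host byte order, the 802.1p priority can be
 * recovered with the two definitions above. */
static inline u16 vlan_pcp_from_tci_example(u16 tci)
{
	/* Mask off the priority bits, then shift them down to bit 0. */
	return (tci & VLAN_PCP_MASK) >> VLAN_PCP_SHIFT;
}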

#define DP_MAX_PORTS 1024
#define DP_MAX_GROUPS 16

#define DP_L2_BITS (PAGE_SHIFT - ilog2(sizeof(struct dp_bucket*)))
#define DP_L2_SIZE (1 << DP_L2_BITS)
#define DP_L2_SHIFT 0

#define DP_L1_BITS (PAGE_SHIFT - ilog2(sizeof(struct dp_bucket**)))
#define DP_L1_SIZE (1 << DP_L1_BITS)
#define DP_L1_SHIFT DP_L2_BITS

/* For 4 kB pages, this is 1,048,576 on 32-bit or 262,144 on 64-bit. */
#define DP_MAX_BUCKETS (DP_L1_SIZE * DP_L2_SIZE)

/**
 * struct dp_table - flow table
 * @n_buckets: number of buckets (a power of 2 between %DP_L1_SIZE and
 * %DP_MAX_BUCKETS)
 * @buckets: pointer to @n_buckets/%DP_L1_SIZE pointers to %DP_L1_SIZE pointers
 * to buckets
 * @hash_seed: random number used for flow hashing, to make the hash
 * distribution harder to predict
 * @rcu: RCU callback structure
 *
 * The @buckets array is logically an array of pointers to buckets.  It is
 * broken into two levels to avoid the need to kmalloc() any object larger than
 * a single page or to use vmalloc().  @buckets is always nonnull, as is each
 * @buckets[i], but each @buckets[i][j] is nonnull only if the specified hash
 * bucket is nonempty (for 0 <= i < @n_buckets/%DP_L1_SIZE, 0 <= j <
 * %DP_L1_SIZE).
 */
struct dp_table {
	unsigned int n_buckets;
	struct dp_bucket ***buckets;
	unsigned int hash_seed;
	struct rcu_head rcu;
};
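
/* Illustrative sketch (an assumption: the real lookup logic lives in
 * dp_table.c, not in this header): indexing the two-level @buckets array
 * for a given flow hash, using the DP_L1 and DP_L2 definitions above. */
static inline struct dp_bucket **dp_find_bucket_example(struct dp_table *table,
							u32 hash)
{
	/* The high bits (within n_buckets) select the first-level page... */
	unsigned int l1 = (hash & (table->n_buckets - 1)) >> DP_L1_SHIFT;
	/* ...and the low DP_L2_BITS bits select the slot within that page. */
	unsigned int l2 = (hash >> DP_L2_SHIFT) & (DP_L2_SIZE - 1);

	return &table->buckets[l1][l2];
}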

/**
 * struct dp_bucket - single bucket within datapath flow table
 * @rcu: RCU callback structure
 * @n_flows: number of flows in @flows[] array
 * @flows: array of @n_flows pointers to flows
 *
 * The expected number of flows per bucket is 1, but this allows for an
 * arbitrary number of collisions.
 */
struct dp_bucket {
	struct rcu_head rcu;
	unsigned int n_flows;
	struct sw_flow *flows[];
};

#define DP_N_QUEUES 3
#define DP_MAX_QUEUE_LEN 100

/**
 * struct dp_stats_percpu - per-cpu packet processing statistics for a given
 * datapath.
 * @n_frags: Number of IP fragments processed by datapath.
 * @n_hit: Number of received packets for which a matching flow was found in
 * the flow table.
 * @n_missed: Number of received packets that had no matching flow in the flow
 * table.  The sum of @n_hit and @n_missed is the number of packets that have
 * been received by the datapath.
 * @n_lost: Number of received packets that had no matching flow in the flow
 * table and that could not be sent to userspace (normally due to an overflow
 * in one of the datapath's queues).
 */
struct dp_stats_percpu {
	u64 n_frags;
	u64 n_hit;
	u64 n_missed;
	u64 n_lost;
};

struct dp_port_group {
	struct rcu_head rcu;
	int n_ports;
	u16 ports[];
};

/**
 * struct datapath - datapath for flow-based packet switching
 * @mutex: Mutual exclusion for ioctls.
 * @dp_idx: Datapath number (index into the dps[] array in datapath.c).
 * @ifobj: Represents /sys/class/net/<devname>/brif.
 * @drop_frags: Drop all IP fragments if nonzero.
 * @queues: %DP_N_QUEUES sets of queued packets for userspace to handle.
 * @waitqueue: Waitqueue, for waiting for new packets in @queues.
 * @n_flows: Number of flows currently in flow table.
 * @table: Current flow table (RCU protected).
 * @groups: Port groups, used by ODPAT_OUTPUT_GROUP action (RCU protected).
 * @n_ports: Number of ports currently in @ports.
 * @ports: Map from port number to &struct net_bridge_port.  %ODPP_LOCAL port
 * always exists, other ports may be %NULL.
 * @port_list: List of all ports in @ports in arbitrary order.
 * @stats_percpu: Per-CPU datapath statistics.
 * @sflow_probability: Number of packets out of UINT_MAX to sample to the
 * %ODPL_SFLOW queue, e.g. (@sflow_probability/UINT_MAX) is the probability of
 * sampling a given packet.
 */
struct datapath {
	struct mutex mutex;
	int dp_idx;
	struct kobject ifobj;

	int drop_frags;

	/* Queued data. */
	struct sk_buff_head queues[DP_N_QUEUES];
	wait_queue_head_t waitqueue;

	/* Flow table. */
	unsigned int n_flows;
	struct dp_table *table;

	/* Port groups. */
	struct dp_port_group *groups[DP_MAX_GROUPS];

	/* Switch ports. */
	unsigned int n_ports;
	struct net_bridge_port *ports[DP_MAX_PORTS];
	struct list_head port_list;

	/* Stats. */
	struct dp_stats_percpu *stats_percpu;

	/* sFlow Sampling */
	unsigned int sflow_probability;
};
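
/* Illustrative sketch (assumption: net_random() is the entropy source, as
 * is typical in net code of this era; the real decision is made in
 * datapath.c): with @sflow_probability scaled to UINT_MAX, a packet is
 * sampled when a uniform random 32-bit value falls below it. */
static inline int dp_sflow_sample_example(const struct datapath *dp)
{
	/* A probability of 0 disables sFlow sampling entirely. */
	return dp->sflow_probability != 0 &&
	       net_random() < dp->sflow_probability;
}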

/**
 * struct net_bridge_port - one port within a datapath
 * @port_no: Index into @dp's @ports array.
 * @dp: Datapath to which this port belongs.
 * @dev: The network device attached to this port.  The @br_port member in @dev
 * points back to this &struct net_bridge_port.
 * @kobj: Represents /sys/class/net/<devname>/brport.
 * @linkname: The name of the link from /sys/class/net/<datapath>/brif to this
 * &struct net_bridge_port.  (We keep this around so that we can delete it
 * if @dev gets renamed.)  Set to the null string when no link exists.
 * @node: Element in @dp's @port_list.
 * @sflow_pool: Number of packets that were candidates for sFlow sampling,
 * regardless of whether they were actually chosen and sent down to userspace.
 */
struct net_bridge_port {
	u16 port_no;
	struct datapath *dp;
	struct net_device *dev;
	struct kobject kobj;
	char linkname[IFNAMSIZ];
	struct list_head node;
	atomic_t sflow_pool;
};

extern struct notifier_block dp_device_notifier;
extern int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);

/* Flow table. */
struct dp_table *dp_table_create(unsigned int n_buckets);
void dp_table_destroy(struct dp_table *, int free_flows);
struct sw_flow *dp_table_lookup(struct dp_table *, const struct odp_flow_key *);
int dp_table_insert(struct dp_table *, struct sw_flow *);
int dp_table_delete(struct dp_table *, struct sw_flow *);
int dp_table_expand(struct datapath *);
int dp_table_flush(struct datapath *);
int dp_table_foreach(struct dp_table *table,
		     int (*callback)(struct sw_flow *flow, void *aux),
		     void *aux);

void dp_process_received_packet(struct sk_buff *, struct net_bridge_port *);
int dp_del_port(struct net_bridge_port *);
int dp_output_control(struct datapath *, struct sk_buff *, int, u32 arg);
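
/* Illustrative sketch (hypothetical helper): dp_output_control() delivers
 * packets to one of the datapath's @queues; per the statistics above, a
 * packet that cannot be queued (normally because the queue already holds
 * DP_MAX_QUEUE_LEN packets) is counted in @n_lost.  The overflow test
 * might look like this. */
static inline int dp_queue_full_example(const struct datapath *dp, int queue_no)
{
	return skb_queue_len(&dp->queues[queue_no]) >= DP_MAX_QUEUE_LEN;
}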

int dp_min_mtu(const struct datapath *dp);
void set_dp_devs_mtu(const struct datapath *dp, struct net_device *dev);

struct datapath *get_dp(int dp_idx);

static inline const char *dp_name(const struct datapath *dp)
{
	return dp->ports[ODPP_LOCAL]->dev->name;
}

#if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
int vswitch_skb_checksum_setup(struct sk_buff *skb);
#else
static inline int vswitch_skb_checksum_setup(struct sk_buff *skb)
{
	return 0;
}
#endif

void forward_ip_summed(struct sk_buff *skb);

#endif /* datapath.h */