mirror of
https://github.com/openvswitch/ovs
synced 2025-08-22 09:58:01 +00:00
Only kernel datapath supports this action so add a function in dpif.c that checks for that. Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Adrian Moreno <amorenoz@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
979 lines
42 KiB
C
979 lines
42 KiB
C
/*
|
||
* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
|
||
*
|
||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
* you may not use this file except in compliance with the License.
|
||
* You may obtain a copy of the License at:
|
||
*
|
||
* http://www.apache.org/licenses/LICENSE-2.0
|
||
*
|
||
* Unless required by applicable law or agreed to in writing, software
|
||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
* See the License for the specific language governing permissions and
|
||
* limitations under the License.
|
||
*/
|
||
|
||
/*
|
||
* dpif, the DataPath InterFace.
|
||
*
|
||
* In Open vSwitch terminology, a "datapath" is a flow-based software switch.
|
||
* A datapath has no intelligence of its own. Rather, it relies entirely on
|
||
* its client to set up flows. The datapath layer is core to the Open vSwitch
|
||
* software switch: one could say, without much exaggeration, that everything
|
||
* in ovs-vswitchd above dpif exists only to make the correct decisions
|
||
* interacting with dpif.
|
||
*
|
||
* Typically, the client of a datapath is the software switch module in
|
||
* "ovs-vswitchd", but other clients can be written. The "ovs-dpctl" utility
|
||
* is also a (simple) client.
|
||
*
|
||
*
|
||
* Overview
|
||
* ========
|
||
*
|
||
* The terms written in quotes below are defined in later sections.
|
||
*
|
||
* When a datapath "port" receives a packet, it extracts the headers (the
|
||
* "flow"). If the datapath's "flow table" contains a "flow entry" matching
|
||
* the packet, then it executes the "actions" in the flow entry and increments
|
||
* the flow's statistics. If there is no matching flow entry, the datapath
|
||
* instead appends the packet to an "upcall" queue.
|
||
*
|
||
*
|
||
* Ports
|
||
* =====
|
||
*
|
||
* A datapath has a set of ports that are analogous to the ports on an Ethernet
|
||
* switch. At the datapath level, each port has the following information
|
||
* associated with it:
|
||
*
|
||
* - A name, a short string that must be unique within the host. This is
|
||
* typically a name that would be familiar to the system administrator,
|
||
* e.g. "eth0" or "vif1.1", but it is otherwise arbitrary.
|
||
*
|
||
* - A 32-bit port number that must be unique within the datapath but is
|
||
* otherwise arbitrary. The port number is the most important identifier
|
||
* for a port in the datapath interface.
|
||
*
|
||
* - A type, a short string that identifies the kind of port. On a Linux
|
||
* host, typical types are "system" (for a network device such as eth0),
|
||
* "internal" (for a simulated port used to connect to the TCP/IP stack),
|
||
* and "gre" (for a GRE tunnel).
|
||
*
|
||
* - A Netlink PID for each upcall reading thread (see "Upcall Queuing and
|
||
* Ordering" below).
|
||
*
|
||
* The dpif interface has functions for adding and deleting ports. When a
|
||
* datapath implements these (e.g. as the Linux and netdev datapaths do), then
|
||
* Open vSwitch's ovs-vswitchd daemon can directly control what ports are used
|
||
* for switching. Some datapaths might not implement them, or implement them
|
||
* with restrictions on the types of ports that can be added or removed,
|
||
* on systems where port membership can only be changed by some external
|
||
* entity.
|
||
*
|
||
* Each datapath must have a port, sometimes called the "local port", whose
|
||
* name is the same as the datapath itself, with port number 0. The local port
|
||
* cannot be deleted.
|
||
*
|
||
* Ports are available as "struct netdev"s. To obtain a "struct netdev *" for
|
||
* a port named 'name' with type 'port_type', in a datapath of type
|
||
* 'datapath_type', call netdev_open(name, dpif_port_open_type(datapath_type,
|
||
* port_type). The netdev can be used to get and set important data related to
|
||
* the port, such as:
|
||
*
|
||
* - MTU (netdev_get_mtu(), netdev_set_mtu()).
|
||
*
|
||
* - Ethernet address (netdev_get_etheraddr(), netdev_set_etheraddr()).
|
||
*
|
||
* - Statistics such as the number of packets and bytes transmitted and
|
||
* received (netdev_get_stats()).
|
||
*
|
||
* - Carrier status (netdev_get_carrier()).
|
||
*
|
||
* - Link features (netdev_get_features()).
|
||
*
|
||
* - Speed (netdev_get_speed()).
|
||
*
|
||
* - QoS queue configuration (netdev_get_queue(), netdev_set_queue() and
|
||
* related functions.)
|
||
*
|
||
* - Arbitrary port-specific configuration parameters (netdev_get_config(),
|
||
* netdev_set_config()). An example of such a parameter is the IP
|
||
* endpoint for a GRE tunnel.
|
||
*
|
||
*
|
||
* Flow Table
|
||
* ==========
|
||
*
|
||
* The flow table is a collection of "flow entries". Each flow entry contains:
|
||
*
|
||
* - A "flow", that is, a summary of the headers in an Ethernet packet. The
|
||
* flow must be unique within the flow table. Flows are fine-grained
|
||
* entities that include L2, L3, and L4 headers. A single TCP connection
|
||
* consists of two flows, one in each direction.
|
||
*
|
||
* In Open vSwitch userspace, "struct flow" is the typical way to describe
|
||
* a flow, but the datapath interface uses a different data format to
|
||
* allow ABI forward- and backward-compatibility. Refer to OVS_KEY_ATTR_*
|
||
* and "struct ovs_key_*" in include/odp-netlink.h for details.
|
||
* lib/odp-util.h defines several functions for working with these flows.
|
||
*
|
||
* - A "mask" that, for each bit in the flow, specifies whether the datapath
|
||
* should consider the corresponding flow bit when deciding whether a
|
||
* given packet matches the flow entry. The original datapath design did
|
||
* not support matching: every flow entry was exact match. With the
|
||
* addition of a mask, the interface supports datapaths with a spectrum of
|
||
* wildcard matching capabilities, from those that only support exact
|
||
* matches to those that support bitwise wildcarding on the entire flow
|
||
* key, as well as datapaths with capabilities somewhere in between.
|
||
*
|
||
* Datapaths do not provide a way to query their wildcarding capabilities,
|
||
* nor is it expected that the client should attempt to probe for the
|
||
* details of their support. Instead, a client installs flows with masks
|
||
* that wildcard as many bits as acceptable. The datapath then actually
|
||
* wildcards as many of those bits as it can and changes the wildcard bits
|
||
* that it does not support into exact match bits. A datapath that can
|
||
* wildcard any bit, for example, would install the supplied mask, an
|
||
* exact-match only datapath would install an exact-match mask regardless
|
||
* of what mask the client supplied, and a datapath in the middle of the
|
||
* spectrum would selectively change some wildcard bits into exact match
|
||
* bits.
|
||
*
|
||
* Regardless of the requested or installed mask, the datapath retains the
|
||
* original flow supplied by the client. (It does not, for example, "zero
|
||
* out" the wildcarded bits.) This allows the client to unambiguously
|
||
* identify the flow entry in later flow table operations.
|
||
*
|
||
* The flow table does not have priorities; that is, all flow entries have
|
||
* equal priority. Detecting overlapping flow entries is expensive in
|
||
* general, so the datapath is not required to do it. It is primarily the
|
||
* client's responsibility not to install flow entries whose flow and mask
|
||
* combinations overlap.
|
||
*
|
||
* - A list of "actions" that tell the datapath what to do with packets
|
||
* within a flow. Some examples of actions are OVS_ACTION_ATTR_OUTPUT,
|
||
* which transmits the packet out a port, and OVS_ACTION_ATTR_SET, which
|
||
* modifies packet headers. Refer to OVS_ACTION_ATTR_* and "struct
|
||
* ovs_action_*" in include/odp-netlink.h for details. lib/odp-util.h
|
||
* defines several functions for working with datapath actions.
|
||
*
|
||
* The actions list may be empty. This indicates that nothing should be
|
||
* done to matching packets, that is, they should be dropped.
|
||
*
|
||
* (In case you are familiar with OpenFlow, datapath actions are analogous
|
||
* to OpenFlow actions.)
|
||
*
|
||
* - Statistics: the number of packets and bytes that the flow has
|
||
* processed, the last time that the flow processed a packet, and the
|
||
* union of all the TCP flags in packets processed by the flow. (The
|
||
* latter is 0 if the flow is not a TCP flow.)
|
||
*
|
||
* The datapath's client manages the flow table, primarily in reaction to
|
||
* "upcalls" (see below).
|
||
*
|
||
*
|
||
* Upcalls
|
||
* =======
|
||
*
|
||
* A datapath sometimes needs to notify its client that a packet was received.
|
||
* The datapath mechanism to do this is called an "upcall".
|
||
*
|
||
* Upcalls are used in two situations:
|
||
*
|
||
* - When a packet is received, but there is no matching flow entry in its
|
||
* flow table (a flow table "miss"), this causes an upcall of type
|
||
* DPIF_UC_MISS. These are called "miss" upcalls.
|
||
*
|
||
* - A datapath action of type OVS_ACTION_ATTR_USERSPACE causes an upcall of
|
||
* type DPIF_UC_ACTION. These are called "action" upcalls.
|
||
*
|
||
* An upcall contains an entire packet. There is no attempt to, e.g., copy
|
||
* only as much of the packet as normally needed to make a forwarding decision.
|
||
* Such an optimization is doable, but experimental prototypes showed it to be
|
||
* of little benefit because an upcall typically contains the first packet of a
|
||
* flow, which is usually short (e.g. a TCP SYN). Also, the entire packet can
|
||
* sometimes really be needed.
|
||
*
|
||
* After a client reads a given upcall, the datapath is finished with it, that
|
||
* is, the datapath doesn't maintain any lingering state past that point.
|
||
*
|
||
* The latency from the time that a packet arrives at a port to the time that
|
||
* it is received from dpif_recv() is critical in some benchmarks. For
|
||
* example, if this latency is 1 ms, then a netperf TCP_CRR test, which opens
|
||
* and closes TCP connections one at a time as quickly as it can, cannot
|
||
* possibly achieve more than 500 transactions per second, since every
|
||
* connection consists of two flows with 1-ms latency to set up each one.
|
||
*
|
||
* To receive upcalls, a client has to enable them with dpif_recv_set(). A
|
||
* datapath should generally support being opened multiple times (e.g. so that
|
||
* one may run "ovs-dpctl show" or "ovs-dpctl dump-flows" while "ovs-vswitchd"
|
||
* is also running) but need not support more than one of these clients
|
||
* enabling upcalls at once.
|
||
*
|
||
*
|
||
* Upcall Queuing and Ordering
|
||
* ---------------------------
|
||
*
|
||
* The datapath's client reads upcalls one at a time by calling dpif_recv().
|
||
* When more than one upcall is pending, the order in which the datapath
|
||
* presents upcalls to its client is important. The datapath's client does not
|
||
* directly control this order, so the datapath implementer must take care
|
||
* during design.
|
||
*
|
||
* The minimal behavior, suitable for initial testing of a datapath
|
||
* implementation, is that all upcalls are appended to a single queue, which is
|
||
* delivered to the client in order.
|
||
*
|
||
* The datapath should ensure that a high rate of upcalls from one particular
|
||
* port cannot cause upcalls from other sources to be dropped or unreasonably
|
||
* delayed. Otherwise, one port conducting a port scan or otherwise initiating
|
||
* high-rate traffic spanning many flows could suppress other traffic.
|
||
* Ideally, the datapath should present upcalls from each port in a "round
|
||
* robin" manner, to ensure fairness.
|
||
*
|
||
* The client has no control over "miss" upcalls and no insight into the
|
||
* datapath's implementation, so the datapath is entirely responsible for
|
||
* queuing and delivering them. On the other hand, the datapath has
|
||
* considerable freedom of implementation. One good approach is to maintain a
|
||
* separate queue for each port, to prevent any given port's upcalls from
|
||
* interfering with other ports' upcalls. If this is impractical, then another
|
||
* reasonable choice is to maintain some fixed number of queues and assign each
|
||
* port to one of them. Ports assigned to the same queue can then interfere
|
||
* with each other, but not with ports assigned to different queues. Other
|
||
* approaches are also possible.
|
||
*
|
||
* The client has some control over "action" upcalls: it can specify a 32-bit
|
||
* "Netlink PID" as part of the action. This terminology comes from the Linux
|
||
* datapath implementation, which uses a protocol called Netlink in which a PID
|
||
* designates a particular socket and the upcall data is delivered to the
|
||
* socket's receive queue. Generically, though, a Netlink PID identifies a
|
||
* queue for upcalls. The basic requirements on the datapath are:
|
||
*
|
||
* - The datapath must provide a Netlink PID associated with each port. The
|
||
* client can retrieve the PID with dpif_port_get_pid().
|
||
*
|
||
* - The datapath must provide a "special" Netlink PID not associated with
|
||
* any port. dpif_port_get_pid() also provides this PID. (ovs-vswitchd
|
||
* uses this PID to queue special packets that must not be lost even if a
|
||
* port is otherwise busy, such as packets used for tunnel monitoring.)
|
||
*
|
||
* The minimal behavior of dpif_port_get_pid() and the treatment of the Netlink
|
||
* PID in "action" upcalls is that dpif_port_get_pid() returns a constant value
|
||
* and all upcalls are appended to a single queue.
|
||
*
|
||
* The preferred behavior is:
|
||
*
|
||
* - Each port has a PID that identifies the queue used for "miss" upcalls
|
||
* on that port. (Thus, if each port has its own queue for "miss"
|
||
* upcalls, then each port has a different Netlink PID.)
|
||
*
|
||
* - "miss" upcalls for a given port and "action" upcalls that specify that
|
||
* port's Netlink PID add their upcalls to the same queue. The upcalls
|
||
* are delivered to the datapath's client in the order that the packets
|
||
* were received, regardless of whether the upcalls are "miss" or "action"
|
||
* upcalls.
|
||
*
|
||
* - Upcalls that specify the "special" Netlink PID are queued separately.
|
||
*
|
||
*
|
||
* Packet Format
|
||
* =============
|
||
*
|
||
* The datapath interface works with packets in a particular form. This is the
|
||
* form taken by packets received via upcalls (i.e. by dpif_recv()). Packets
|
||
* supplied to the datapath for processing (i.e. to dpif_execute()) also take
|
||
* this form.
|
||
*
|
||
* A VLAN tag is represented by an 802.1Q header. If the layer below the
|
||
* datapath interface uses another representation, then the datapath interface
|
||
* must perform conversion.
|
||
*
|
||
* The datapath interface requires all packets to fit within the MTU. Some
|
||
* operating systems internally process packets larger than MTU, with features
|
||
* such as TSO and UFO. When such a packet passes through the datapath
|
||
* interface, it must be broken into multiple MTU or smaller sized packets for
|
||
* presentation as upcalls. (This does not happen often, because an upcall
|
||
* typically contains the first packet of a flow, which is usually short.)
|
||
*
|
||
* Some operating system TCP/IP stacks maintain packets in an unchecksummed or
|
||
* partially checksummed state until transmission. The datapath interface
|
||
* requires all host-generated packets to be fully checksummed (e.g. IP and TCP
|
||
* checksums must be correct). On such an OS, the datapath interface must fill
|
||
* in these checksums.
|
||
*
|
||
* Packets passed through the datapath interface must be at least 14 bytes
|
||
* long, that is, they must have a complete Ethernet header. They are not
|
||
* required to be padded to the minimum Ethernet length.
|
||
*
|
||
*
|
||
* Typical Usage
|
||
* =============
|
||
*
|
||
* Typically, the client of a datapath begins by configuring the datapath with
|
||
* a set of ports. Afterward, the client runs in a loop polling for upcalls to
|
||
* arrive.
|
||
*
|
||
* For each upcall received, the client examines the enclosed packet and
|
||
* figures out what should be done with it. For example, if the client
|
||
* implements a MAC-learning switch, then it searches the forwarding database
|
||
* for the packet's destination MAC and VLAN and determines the set of ports to
|
||
* which it should be sent. In any case, the client composes a set of datapath
|
||
* actions to properly dispatch the packet and then directs the datapath to
|
||
* execute those actions on the packet (e.g. with dpif_execute()).
|
||
*
|
||
* Most of the time, the actions that the client executed on the packet apply
|
||
* to every packet with the same flow. For example, the flow includes both
|
||
* destination MAC and VLAN ID (and much more), so this is true for the
|
||
* MAC-learning switch example above. In such a case, the client can also
|
||
* direct the datapath to treat any further packets in the flow in the same
|
||
* way, using dpif_flow_put() to add a new flow entry.
|
||
*
|
||
* Other tasks the client might need to perform, in addition to reacting to
|
||
* upcalls, include:
|
||
*
|
||
* - Periodically polling flow statistics, perhaps to supply to its own
|
||
* clients.
|
||
*
|
||
* - Deleting flow entries from the datapath that haven't been used
|
||
* recently, to save memory.
|
||
*
|
||
* - Updating flow entries whose actions should change. For example, if a
|
||
* MAC learning switch learns that a MAC has moved, then it must update
|
||
* the actions of flow entries that sent packets to the MAC at its old
|
||
* location.
|
||
*
|
||
* - Adding and removing ports to achieve a new configuration.
|
||
*
|
||
*
|
||
* Thread-safety
|
||
* =============
|
||
*
|
||
* Most of the dpif functions are fully thread-safe: they may be called from
|
||
* any number of threads on the same or different dpif objects. The exceptions
|
||
* are:
|
||
*
|
||
* - dpif_port_poll() and dpif_port_poll_wait() are conditionally
|
||
* thread-safe: they may be called from different threads only on
|
||
* different dpif objects.
|
||
*
|
||
* - dpif_flow_dump_next() is conditionally thread-safe: It may be called
|
||
* from different threads with the same 'struct dpif_flow_dump', but all
|
||
* other parameters must be different for each thread.
|
||
*
|
||
* - dpif_flow_dump_done() is conditionally thread-safe: All threads that
|
||
* share the same 'struct dpif_flow_dump' must have finished using it.
|
||
* This function must then be called exactly once for a particular
|
||
* dpif_flow_dump to finish the corresponding flow dump operation.
|
||
*
|
||
* - Functions that operate on 'struct dpif_port_dump' are conditionally
|
||
* thread-safe with respect to those objects. That is, one may dump ports
|
||
* from any number of threads at once, but each thread must use its own
|
||
* struct dpif_port_dump.
|
||
*/
|
||
#ifndef DPIF_H
|
||
#define DPIF_H 1
|
||
|
||
#include <stdbool.h>
|
||
#include <stddef.h>
|
||
#include <stdint.h>
|
||
|
||
#include "dpdk.h"
|
||
#include "dp-packet.h"
|
||
#include "netdev.h"
|
||
#include "openflow/openflow.h"
|
||
#include "openvswitch/ofp-meter.h"
|
||
#include "ovs-numa.h"
|
||
#include "packets.h"
|
||
#include "util.h"
|
||
|
||
#ifdef __cplusplus
|
||
extern "C" {
|
||
#endif
|
||
|
||
/* Forward declarations.  These types are used only by reference in this
 * header, so incomplete declarations suffice. */
struct dpif;
struct dpif_class;
struct dpif_flow;
struct ds;
struct flow;
struct flow_wildcards;
struct nlattr;
struct sset;
||
/* Datapath provider registration and enumeration.
 *
 * A "provider" (struct dpif_class) implements one kind of datapath, e.g. the
 * Linux kernel datapath or the userspace netdev datapath. */
int dp_register_provider(const struct dpif_class *);
int dp_unregister_provider(const char *type);
void dp_disallow_provider(const char *type);
void dp_enumerate_types(struct sset *types);
const char *dpif_normalize_type(const char *);

int dp_enumerate_names(const char *type, struct sset *names);
void dp_parse_name(const char *datapath_name, char **name, char **type);
|
||
/* Opening, creating, and closing datapath handles. */
int dpif_open(const char *name, const char *type, struct dpif **);
int dpif_create(const char *name, const char *type, struct dpif **);
int dpif_create_and_open(const char *name, const char *type, struct dpif **);
void dpif_close(struct dpif *);

/* Periodic housekeeping; clients call these from their main loop. */
bool dpif_run(struct dpif *);
void dpif_wait(struct dpif *);

/* Identification. */
const char *dpif_name(const struct dpif *);
const char *dpif_base_name(const struct dpif *);
const char *dpif_type(const struct dpif *);

bool dpif_cleanup_required(const struct dpif *);

int dpif_delete(struct dpif *);
||
/* Statistics for a dpif as a whole. */
struct dpif_dp_stats {
    uint64_t n_hit;             /* Number of flow table matches. */
    uint64_t n_missed;          /* Number of flow table misses. */
    uint64_t n_lost;            /* Number of misses not sent to userspace. */
    uint64_t n_flows;           /* Number of flows present. */
    uint64_t n_cache_hit;       /* Number of mega flow mask cache hits for
                                   flow table matches. */
    uint64_t n_mask_hit;        /* Number of mega flow masks visited for
                                   flow table matches. */
    uint32_t n_masks;           /* Number of mega flow masks. */
};
int dpif_get_dp_stats(const struct dpif *, struct dpif_dp_stats *);
|
||
/* Sets datapath-wide feature flags.  NOTE(review): the exact meaning of
 * 'new_features' bits is defined by the datapath implementation — confirm
 * against the provider. */
int dpif_set_features(struct dpif *, uint32_t new_features);

/* Retrieves the number of flows currently offloaded to hardware into
 * '*n_flows'. */
int dpif_get_n_offloaded_flows(struct dpif *dpif, uint64_t *n_flows);
||
|
||
/* Port operations. */
|
||
|
||
const char *dpif_port_open_type(const char *datapath_type,
|
||
const char *port_type);
|
||
int dpif_port_add(struct dpif *, struct netdev *, odp_port_t *port_nop);
|
||
int dpif_port_del(struct dpif *, odp_port_t port_no, bool local_delete);
|
||
|
||
/* A port within a datapath.
|
||
*
|
||
* 'name' and 'type' are suitable for passing to netdev_open(). */
|
||
struct dpif_port {
|
||
char *name; /* Network device name, e.g. "eth0". */
|
||
char *type; /* Network device type, e.g. "system". */
|
||
odp_port_t port_no; /* Port number within datapath. */
|
||
};
|
||
void dpif_port_clone(struct dpif_port *, const struct dpif_port *);
|
||
void dpif_port_destroy(struct dpif_port *);
|
||
bool dpif_port_exists(const struct dpif *dpif, const char *devname);
|
||
int dpif_port_query_by_number(const struct dpif *, odp_port_t port_no,
|
||
struct dpif_port *, bool warn_if_not_found);
|
||
int dpif_port_query_by_name(const struct dpif *, const char *devname,
|
||
struct dpif_port *);
|
||
int dpif_port_get_name(struct dpif *, odp_port_t port_no,
|
||
char *name, size_t name_size);
|
||
uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no);
|
||
|
||
/* State for iterating over a datapath's ports.  Treat as opaque; use the
 * functions and the DPIF_PORT_FOR_EACH macro below. */
struct dpif_port_dump {
    const struct dpif *dpif;
    int error;
    void *state;
};
void dpif_port_dump_start(struct dpif_port_dump *, const struct dpif *);
bool dpif_port_dump_next(struct dpif_port_dump *, struct dpif_port *);
int dpif_port_dump_done(struct dpif_port_dump *);

/* Iterates through each DPIF_PORT in DPIF, using DUMP as state.
 *
 * Arguments all have pointer type.
 *
 * If you break out of the loop, then you need to free the dump structure by
 * hand using dpif_port_dump_done(). */
#define DPIF_PORT_FOR_EACH(DPIF_PORT, DUMP, DPIF)   \
    for (dpif_port_dump_start(DUMP, DPIF);          \
         (dpif_port_dump_next(DUMP, DPIF_PORT)      \
          ? true                                    \
          : (dpif_port_dump_done(DUMP), false));    \
        )

int dpif_port_poll(const struct dpif *, char **devnamep);
void dpif_port_poll_wait(const struct dpif *);
|
||
/* Flow table operations. */

/* Per-flow statistics as reported by the datapath. */
struct dpif_flow_stats {
    uint64_t n_packets;         /* Number of packets processed by the flow. */
    uint64_t n_bytes;           /* Number of bytes processed by the flow. */
    long long int used;         /* Last time the flow processed a packet. */
    uint16_t tcp_flags;         /* Union of TCP flags seen (0 if not TCP). */
};
|
||
/* More statistics info for offloaded packets and bytes. */
struct dpif_flow_detailed_stats {
    uint64_t n_packets;
    uint64_t n_bytes;
    /* n_offload_packets are a subset of n_packets */
    uint64_t n_offload_packets;
    /* n_offload_bytes are a subset of n_bytes */
    uint64_t n_offload_bytes;
    long long int used;
    uint16_t tcp_flags;
};
|
||
/* Attributes describing where and how a flow is handled. */
struct dpif_flow_attrs {
    bool offloaded;            /* True if flow is offloaded to HW. */
    const char *dp_layer;      /* DP layer the flow is handled in. */
    const char *dp_extra_info; /* Extra information provided by DP. */
};
|
||
/* Selects which kinds of flows a flow dump should include. */
struct dpif_flow_dump_types {
    bool ovs_flows;     /* Include flows from the OVS (software) datapath. */
    bool netdev_flows;  /* Include flows attached to netdevs (offloads). */
};
|
||
/* Fills in flow statistics from a flow and a packet; formats statistics into
 * a dynamic string for display. */
void dpif_flow_stats_extract(const struct flow *, const struct dp_packet *packet,
                             long long int used, struct dpif_flow_stats *);
void dpif_flow_stats_format(const struct dpif_flow_stats *, struct ds *);
|
||
/* Bitmask flags controlling dpif_flow_put() behavior. */
enum dpif_flow_put_flags {
    DPIF_FP_CREATE = 1 << 0,    /* Allow creating a new flow. */
    DPIF_FP_MODIFY = 1 << 1,    /* Allow modifying an existing flow. */
    DPIF_FP_ZERO_STATS = 1 << 2, /* Zero the stats of an existing flow. */
    DPIF_FP_PROBE = 1 << 3      /* Suppress error messages, if any. */
};
|
||
bool dpif_probe_feature(struct dpif *, const char *name,
|
||
const struct ofpbuf *key, const struct ofpbuf *actions,
|
||
const ovs_u128 *ufid);
|
||
int dpif_flow_flush(struct dpif *);
|
||
int dpif_flow_put(struct dpif *, enum dpif_flow_put_flags,
|
||
const struct nlattr *key, size_t key_len,
|
||
const struct nlattr *mask, size_t mask_len,
|
||
const struct nlattr *actions, size_t actions_len,
|
||
const ovs_u128 *ufid, const unsigned pmd_id,
|
||
struct dpif_flow_stats *);
|
||
int dpif_flow_del(struct dpif *,
|
||
const struct nlattr *key, size_t key_len,
|
||
const ovs_u128 *ufid, const unsigned pmd_id,
|
||
struct dpif_flow_stats *);
|
||
int dpif_flow_get(struct dpif *,
|
||
const struct nlattr *key, size_t key_len,
|
||
const ovs_u128 *ufid, const unsigned pmd_id,
|
||
struct ofpbuf *, struct dpif_flow *);
|
||
|
||
/* Flow dumping interface
 * ======================
 *
 * This interface allows iteration through all of the flows currently installed
 * in a datapath.  It is somewhat complicated by two requirements:
 *
 *    - Efficient support for dumping flows in parallel from multiple threads.
 *
 *    - Allow callers to avoid making unnecessary copies of data returned by
 *      the interface across several flows in cases where the dpif
 *      implementation has to maintain a copy of that information anyhow.
 *      (That is, allow the client visibility into any underlying batching as
 *      part of its own batching.)
 *
 *
 * Usage
 * -----
 *
 * 1. Call dpif_flow_dump_create().
 * 2. In each thread that participates in the dump (which may be just a single
 *    thread if parallelism isn't important):
 *    (a) Call dpif_flow_dump_thread_create().
 *    (b) Call dpif_flow_dump_next() repeatedly until it returns 0.
 *    (c) Call dpif_flow_dump_thread_destroy().
 * 3. Call dpif_flow_dump_destroy().
 *
 * All error reporting is deferred to the call to dpif_flow_dump_destroy().
 */
struct dpif_flow_dump *dpif_flow_dump_create(const struct dpif *, bool terse,
                                             struct dpif_flow_dump_types *);
int dpif_flow_dump_destroy(struct dpif_flow_dump *);

struct dpif_flow_dump_thread *dpif_flow_dump_thread_create(
    struct dpif_flow_dump *);
void dpif_flow_dump_thread_destroy(struct dpif_flow_dump_thread *);
||
#define PMD_ID_NULL OVS_CORE_UNSPEC
|
||
|
||
/* A datapath flow as dumped by dpif_flow_dump_next(). */
|
||
struct dpif_flow {
|
||
const struct nlattr *key; /* Flow key, as OVS_KEY_ATTR_* attrs. */
|
||
size_t key_len; /* 'key' length in bytes. */
|
||
const struct nlattr *mask; /* Flow mask, as OVS_KEY_ATTR_* attrs. */
|
||
size_t mask_len; /* 'mask' length in bytes. */
|
||
const struct nlattr *actions; /* Actions, as OVS_ACTION_ATTR_ */
|
||
size_t actions_len; /* 'actions' length in bytes. */
|
||
ovs_u128 ufid; /* Unique flow identifier. */
|
||
bool ufid_present; /* True if 'ufid' was provided by datapath.*/
|
||
unsigned pmd_id; /* Datapath poll mode driver id. */
|
||
struct dpif_flow_stats stats; /* Flow statistics. */
|
||
struct dpif_flow_attrs attrs; /* Flow attributes. */
|
||
};
|
||
int dpif_flow_dump_next(struct dpif_flow_dump_thread *,
|
||
struct dpif_flow *flows, int max_flows);
|
||
|
||
/* Suggested buffer size for a single flow fetched via dpif_flow_get(). */
#define DPIF_FLOW_BUFSIZE 2048

/* Operation batching interface.
 *
 * Some datapaths are faster at performing N operations together than the same
 * N operations individually, hence an interface for batching.
 */

enum dpif_op_type {
    DPIF_OP_FLOW_PUT = 1,
    DPIF_OP_FLOW_DEL,
    DPIF_OP_EXECUTE,
    DPIF_OP_FLOW_GET,
};
|
||
/* offload_type argument types to (*operate) interface */
enum dpif_offload_type {
    DPIF_OFFLOAD_AUTO,      /* Offload if possible, fallback to software. */
    DPIF_OFFLOAD_NEVER,     /* Never offload to hardware. */
    DPIF_OFFLOAD_ALWAYS,    /* Always offload to hardware. */
};
|
||
/* Add or modify a flow.
|
||
*
|
||
* The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in
|
||
* the 'key_len' bytes starting at 'key'. The associated actions are specified
|
||
* by the Netlink attributes with types OVS_ACTION_ATTR_* in the 'actions_len'
|
||
* bytes starting at 'actions'.
|
||
*
|
||
* - If the flow's key does not exist in the dpif, then the flow will be
|
||
* added if 'flags' includes DPIF_FP_CREATE. Otherwise the operation will
|
||
* fail with ENOENT.
|
||
*
|
||
* If the operation succeeds, then 'stats', if nonnull, will be zeroed.
|
||
*
|
||
* - If the flow's key does exist in the dpif, then the flow's actions will
|
||
* be updated if 'flags' includes DPIF_FP_MODIFY. Otherwise the operation
|
||
* will fail with EEXIST. If the flow's actions are updated, then its
|
||
* statistics will be zeroed if 'flags' includes DPIF_FP_ZERO_STATS, and
|
||
* left as-is otherwise.
|
||
*
|
||
* If the operation succeeds, then 'stats', if nonnull, will be set to the
|
||
* flow's statistics before the update.
|
||
*
|
||
* - If the datapath implements multiple pmd thread with its own flow
|
||
* table, 'pmd_id' should be used to specify the particular polling
|
||
* thread for the operation. PMD_ID_NULL means that the flow should
|
||
* be put on all the polling threads.
|
||
*/
|
||
struct dpif_flow_put {
|
||
/* Input. */
|
||
enum dpif_flow_put_flags flags; /* DPIF_FP_*. */
|
||
const struct nlattr *key; /* Flow to put. */
|
||
size_t key_len; /* Length of 'key' in bytes. */
|
||
const struct nlattr *mask; /* Mask to put. */
|
||
size_t mask_len; /* Length of 'mask' in bytes. */
|
||
const struct nlattr *actions; /* Actions to perform on flow. */
|
||
size_t actions_len; /* Length of 'actions' in bytes. */
|
||
const ovs_u128 *ufid; /* Optional unique flow identifier. */
|
||
unsigned pmd_id; /* Datapath poll mode driver id. */
|
||
|
||
/* Output. */
|
||
struct dpif_flow_stats *stats; /* Optional flow statistics. */
|
||
};
|
||
|
||
/* Delete a flow.
|
||
*
|
||
* The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in
|
||
* the 'key_len' bytes starting at 'key', or the unique identifier 'ufid'. If
|
||
* the flow was created using 'ufid', then 'ufid' must be specified to delete
|
||
* the flow. If both are specified, 'key' will be ignored for flow deletion.
|
||
* Succeeds with status 0 if the flow is deleted, or fails with ENOENT if the
|
||
* dpif does not contain such a flow.
|
||
*
|
||
* Callers should always provide the 'key' to improve dpif logging in the event
|
||
* of errors or unexpected behaviour.
|
||
*
|
||
* If the datapath implements multiple polling thread with its own flow table,
|
||
* 'pmd_id' should be used to specify the particular polling thread for the
|
||
* operation. PMD_ID_NULL means that the flow should be deleted from all the
|
||
* polling threads.
|
||
*
|
||
* If the operation succeeds, then 'stats', if nonnull, will be set to the
|
||
* flow's statistics before its deletion. */
|
||
struct dpif_flow_del {
|
||
/* Input. */
|
||
const struct nlattr *key; /* Flow to delete. */
|
||
size_t key_len; /* Length of 'key' in bytes. */
|
||
const ovs_u128 *ufid; /* Unique identifier of flow to delete. */
|
||
bool terse; /* OK to skip sending/receiving full flow
|
||
* info? */
|
||
unsigned pmd_id; /* Datapath poll mode driver id. */
|
||
|
||
/* Output. */
|
||
struct dpif_flow_stats *stats; /* Optional flow statistics. */
|
||
};
|
||
|
||
/* Executes actions on a specified packet.
 *
 * Performs the 'actions_len' bytes of actions in 'actions' on the Ethernet
 * frame in 'packet' and on the packet metadata in 'md'. May modify both
 * 'packet' and 'md'.
 *
 * Some dpif providers do not implement every action. The Linux kernel
 * datapath, in particular, does not implement ARP field modification. If
 * 'needs_help' is true, the dpif layer executes in userspace all of the
 * actions that it can, and for OVS_ACTION_ATTR_OUTPUT and
 * OVS_ACTION_ATTR_USERSPACE actions it passes the packet through to the dpif
 * implementation.
 *
 * This works even if 'actions_len' is too long for a Netlink attribute. */
struct dpif_execute {
    /* Input. */
    const struct nlattr *actions;   /* Actions to execute on packet. */
    size_t actions_len;             /* Length of 'actions' in bytes. */
    bool needs_help;                /* Execute unimplemented actions in
                                     * userspace? (See comment above.) */
    bool probe;                     /* Suppress error messages. */
    unsigned int mtu;               /* Maximum transmission unit to fragment.
                                     * 0 if not a fragmented packet. */
    uint64_t hash;                  /* Packet flow hash. 0 if not specified. */
    const struct flow *flow;        /* Flow extracted from 'packet'. */

    /* Input, but possibly modified as a side effect of execution. */
    struct dp_packet *packet;       /* Packet to execute. */
};

/* Queries the dpif for a flow entry.
|
||
*
|
||
* The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in
|
||
* the 'key_len' bytes starting at 'key', or the unique identifier 'ufid'. If
|
||
* the flow was created using 'ufid', then 'ufid' must be specified to fetch
|
||
* the flow. If both are specified, 'key' will be ignored for the flow query.
|
||
* 'buffer' must point to an initialized buffer, with a recommended size of
|
||
* DPIF_FLOW_BUFSIZE bytes.
|
||
*
|
||
* On success, 'flow' will be populated with the mask, actions, stats and attrs
|
||
* for the datapath flow corresponding to 'key'. The mask and actions may point
|
||
* within '*buffer', or may point at RCU-protected data. Therefore, callers
|
||
* that wish to hold these over quiescent periods must make a copy of these
|
||
* fields before quiescing.
|
||
*
|
||
* Callers should always provide 'key' to improve dpif logging in the event of
|
||
* errors or unexpected behaviour.
|
||
*
|
||
* If the datapath implements multiple polling thread with its own flow table,
|
||
* 'pmd_id' should be used to specify the particular polling thread for the
|
||
* operation. PMD_ID_NULL means that the datapath will return the first
|
||
* matching flow from any poll thread.
|
||
*
|
||
* Succeeds with status 0 if the flow is fetched, or fails with ENOENT if no
|
||
* such flow exists. Other failures are indicated with a positive errno value.
|
||
*/
|
||
struct dpif_flow_get {
|
||
/* Input. */
|
||
const struct nlattr *key; /* Flow to get. */
|
||
size_t key_len; /* Length of 'key' in bytes. */
|
||
const ovs_u128 *ufid; /* Unique identifier of flow to get. */
|
||
unsigned pmd_id; /* Datapath poll mode driver id. */
|
||
struct ofpbuf *buffer; /* Storage for output parameters. */
|
||
|
||
/* Output. */
|
||
struct dpif_flow *flow; /* Resulting flow from datapath. */
|
||
};
|
||
|
||
int dpif_execute(struct dpif *, struct dpif_execute *);
|
||
|
||
struct dpif_op {
|
||
enum dpif_op_type type;
|
||
int error;
|
||
union {
|
||
struct dpif_flow_put flow_put;
|
||
struct dpif_flow_del flow_del;
|
||
struct dpif_execute execute;
|
||
struct dpif_flow_get flow_get;
|
||
};
|
||
};
|
||
|
||
void dpif_operate(struct dpif *, struct dpif_op **ops, size_t n_ops,
                  enum dpif_offload_type);

/* Queries the datapath for hardware offloads stats.
 *
 * Statistics are written in 'stats' following the 'netdev_custom_stats'
 * format. They are allocated on the heap and must be freed by the caller,
 * using 'netdev_free_custom_stats_counters'.
 */
int dpif_offload_stats_get(struct dpif *dpif,
                           struct netdev_custom_stats *stats);

/* Upcalls. */

enum dpif_upcall_type {
    DPIF_UC_MISS,               /* Miss in flow table. */
    DPIF_UC_ACTION,             /* OVS_ACTION_ATTR_USERSPACE action. */
    DPIF_N_UC_TYPES             /* Number of upcall types; not a real type. */
};

const char *dpif_upcall_type_to_string(enum dpif_upcall_type);

/* A packet passed up from the datapath to userspace.
|
||
*
|
||
* The 'packet', 'key' and 'userdata' may point into data in a buffer
|
||
* provided by the caller, so the buffer should be released only after the
|
||
* upcall processing has been finished.
|
||
*
|
||
* While being processed, the 'packet' may be reallocated, so the packet must
|
||
* be separately released with ofpbuf_uninit().
|
||
*/
|
||
struct dpif_upcall {
|
||
/* All types. */
|
||
struct dp_packet packet; /* Packet data,'dp_packet' should be the first
|
||
member to avoid a hole. This is because
|
||
'rte_mbuf' in dp_packet is aligned atleast
|
||
on a 64-byte boundary */
|
||
enum dpif_upcall_type type;
|
||
struct nlattr *key; /* Flow key. */
|
||
size_t key_len; /* Length of 'key' in bytes. */
|
||
ovs_u128 ufid; /* Unique flow identifier for 'key'. */
|
||
struct nlattr *mru; /* Maximum receive unit. */
|
||
struct nlattr *hash; /* Packet hash. */
|
||
struct nlattr *cutlen; /* Number of bytes shrink from the end. */
|
||
|
||
/* DPIF_UC_ACTION only. */
|
||
struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
|
||
struct nlattr *out_tun_key; /* Output tunnel key. */
|
||
struct nlattr *actions; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
|
||
};
|
||
|
||
/* A callback to notify a higher layer that a dpif is about to be purged, so
 * that the higher layer can react to it (e.g. by grabbing all flow stats
 * before they are gone). This function is currently implemented only by
 * dpif-netdev.
 *
 * The caller needs to provide the 'aux' pointer passed down by the higher
 * layer from the dpif_register_notify_cb() function and the 'pmd_id' of
 * the polling thread.
 */
typedef void dp_purge_callback(void *aux, unsigned pmd_id);

void dpif_register_dp_purge_cb(struct dpif *, dp_purge_callback *, void *aux);

/* A callback to process an upcall, currently implemented only by dpif-netdev.
|
||
*
|
||
* The caller provides the 'packet' and 'flow' to process, the corresponding
|
||
* 'ufid' as generated by odp_flow_key_hash(), the polling thread id 'pmd_id',
|
||
* the 'type' of the upcall, and if 'type' is DPIF_UC_ACTION then the
|
||
* 'userdata' attached to the action.
|
||
*
|
||
* The callback must fill in 'actions' with the datapath actions to apply to
|
||
* 'packet'. 'wc' and 'put_actions' will either be both null or both nonnull.
|
||
* If they are nonnull, then the caller will install a flow entry to process
|
||
* all future packets that match 'flow' and 'wc'; the callback must store a
|
||
* wildcard mask suitable for that purpose into 'wc'. If the actions to store
|
||
* into the flow entry are the same as 'actions', then the callback may leave
|
||
* 'put_actions' empty; otherwise it must store the desired actions into
|
||
* 'put_actions'.
|
||
*
|
||
* Returns 0 if successful, ENOSPC if the flow limit has been reached and no
|
||
* flow should be installed, or some otherwise a positive errno value. */
|
||
typedef int upcall_callback(const struct dp_packet *packet,
|
||
const struct flow *flow,
|
||
ovs_u128 *ufid,
|
||
unsigned pmd_id,
|
||
enum dpif_upcall_type type,
|
||
const struct nlattr *userdata,
|
||
struct ofpbuf *actions,
|
||
struct flow_wildcards *wc,
|
||
struct ofpbuf *put_actions,
|
||
void *aux);
|
||
|
||
void dpif_register_upcall_cb(struct dpif *, upcall_callback *, void *aux);
|
||
|
||
int dpif_recv_set(struct dpif *, bool enable);
|
||
int dpif_handlers_set(struct dpif *, uint32_t n_handlers);
|
||
bool dpif_number_handlers_required(struct dpif *, uint32_t *n_handlers);
|
||
int dpif_set_config(struct dpif *, const struct smap *cfg);
|
||
int dpif_port_set_config(struct dpif *, odp_port_t, const struct smap *cfg);
|
||
int dpif_recv(struct dpif *, uint32_t handler_id, struct dpif_upcall *,
|
||
struct ofpbuf *);
|
||
void dpif_recv_purge(struct dpif *);
|
||
void dpif_recv_wait(struct dpif *, uint32_t handler_id);
|
||
void dpif_enable_upcall(struct dpif *);
|
||
void dpif_disable_upcall(struct dpif *);
|
||
|
||
void dpif_print_packet(struct dpif *, struct dpif_upcall *);
|
||
|
||
/* Meters. */
|
||
void dpif_meter_get_features(const struct dpif *,
|
||
struct ofputil_meter_features *);
|
||
int dpif_meter_set(struct dpif *, ofproto_meter_id meter_id,
|
||
struct ofputil_meter_config *);
|
||
int dpif_meter_get(const struct dpif *, ofproto_meter_id meter_id,
|
||
struct ofputil_meter_stats *, uint16_t n_bands);
|
||
int dpif_meter_del(struct dpif *, ofproto_meter_id meter_id,
|
||
struct ofputil_meter_stats *, uint16_t n_bands);
|
||
|
||
/* Bonding. */
|
||
|
||
/* Bit-mask for hashing a flow down to a bucket. */
|
||
#define BOND_MASK 0xff
|
||
#define BOND_BUCKETS (BOND_MASK + 1)
|
||
|
||
int dpif_bond_add(struct dpif *, uint32_t bond_id, odp_port_t *member_map);
|
||
int dpif_bond_del(struct dpif *, uint32_t bond_id);
|
||
int dpif_bond_stats_get(struct dpif *, uint32_t bond_id, uint64_t *n_bytes);
|
||
bool dpif_supports_lb_output_action(const struct dpif *);
|
||
|
||
|
||
/* Cache */
int dpif_cache_get_supported_levels(struct dpif *dpif, uint32_t *levels);
int dpif_cache_get_name(struct dpif *dpif, uint32_t level, const char **name);
int dpif_cache_get_size(struct dpif *dpif, uint32_t level, uint32_t *size);
int dpif_cache_set_size(struct dpif *dpif, uint32_t level, uint32_t size);

/* Miscellaneous. */
|
||
|
||
void dpif_get_netflow_ids(const struct dpif *,
|
||
uint8_t *engine_type, uint8_t *engine_id);
|
||
|
||
int dpif_queue_to_priority(const struct dpif *, uint32_t queue_id,
|
||
uint32_t *priority);
|
||
|
||
int dpif_get_pmds_for_port(const struct dpif * dpif, odp_port_t port_no,
|
||
unsigned int **pmds, size_t *n);
|
||
|
||
char *dpif_get_dp_version(const struct dpif *);
|
||
bool dpif_supports_tnl_push_pop(const struct dpif *);
|
||
bool dpif_may_support_explicit_drop_action(const struct dpif *);
|
||
bool dpif_may_support_psample(const struct dpif *);
|
||
bool dpif_synced_dp_layers(struct dpif *);
|
||
|
||
/* Log functions. */
|
||
struct vlog_module;
|
||
|
||
void log_flow_message(const struct dpif *dpif, int error,
|
||
const struct vlog_module *module,
|
||
const char *operation,
|
||
const struct nlattr *key, size_t key_len,
|
||
const struct nlattr *mask, size_t mask_len,
|
||
const ovs_u128 *ufid,
|
||
const struct dpif_flow_stats *stats,
|
||
const struct nlattr *actions, size_t actions_len);
|
||
void log_flow_put_message(const struct dpif *,
|
||
const struct vlog_module *,
|
||
const struct dpif_flow_put *,
|
||
int error);
|
||
void log_flow_del_message(const struct dpif *,
|
||
const struct vlog_module *,
|
||
const struct dpif_flow_del *,
|
||
int error);
|
||
void log_execute_message(const struct dpif *,
|
||
const struct vlog_module *,
|
||
const struct dpif_execute *,
|
||
bool subexecute, int error);
|
||
void log_flow_get_message(const struct dpif *,
|
||
const struct vlog_module *,
|
||
const struct dpif_flow_get *,
|
||
int error);
|
||
#ifdef __cplusplus
}
#endif

#endif /* dpif.h */