/*
 * Copyright (c) 2009-2014, 2018 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef DPIF_PROVIDER_H
#define DPIF_PROVIDER_H 1

/* Provider interface to dpifs, which provide an interface to an Open vSwitch
 * datapath.  A datapath is a collection of physical or virtual ports that are
 * exposed over OpenFlow as a single switch.  Datapaths and the collections of
 * ports that they contain may be fixed or dynamic. */

#include "openflow/openflow.h"
#include "dpif.h"
#include "util.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Open vSwitch datapath interface.
 *
 * This structure should be treated as opaque by dpif implementations. */
struct dpif {
    const struct dpif_class *dpif_class;
    char *base_name;                /* Datapath name, e.g. "ovs-system". */
    char *full_name;                /* "<class type>@<base_name>". */
    uint8_t netflow_engine_type;
    uint8_t netflow_engine_id;
    long long int current_ms;
};

struct dpif_ipf_status;
struct ipf_dump_ctx;

void dpif_init(struct dpif *, const struct dpif_class *, const char *name,
               uint8_t netflow_engine_type, uint8_t netflow_engine_id);
void dpif_uninit(struct dpif *dpif, bool close);

static inline void dpif_assert_class(const struct dpif *dpif,
                                     const struct dpif_class *dpif_class)
{
    ovs_assert(dpif->dpif_class == dpif_class);
}
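
/* A dpif implementation typically embeds 'struct dpif' inside a larger,
 * provider-specific structure and recovers it in each member function with a
 * cast helper built on dpif_assert_class().  A minimal sketch (the "foo"
 * names are hypothetical; CONTAINER_OF comes from util.h):
 *
 *     struct dpif_foo {
 *         struct dpif dpif;          // Embedded base object.
 *         int private_state;         // Provider-private members.
 *     };
 *
 *     static struct dpif_foo *
 *     dpif_foo_cast(const struct dpif *dpif)
 *     {
 *         dpif_assert_class(dpif, &dpif_foo_class);
 *         return CONTAINER_OF(dpif, struct dpif_foo, dpif);
 *     }
 */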

struct dpif_flow_dump {
    struct dpif *dpif;
    bool terse;         /* If true, key/mask/actions may be omitted. */
};

static inline void
dpif_flow_dump_init(struct dpif_flow_dump *dump, const struct dpif *dpif)
{
    dump->dpif = CONST_CAST(struct dpif *, dpif);
}

struct dpif_flow_dump_thread {
    struct dpif *dpif;
};

static inline void
dpif_flow_dump_thread_init(struct dpif_flow_dump_thread *thread,
                           struct dpif_flow_dump *dump)
{
    thread->dpif = dump->dpif;
}
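
/* Providers wrap these base structures the same way, calling the init
 * helpers above from their 'flow_dump_create' and 'flow_dump_thread_create'
 * implementations.  A sketch with hypothetical "foo" names ('types' handling
 * is elided):
 *
 *     struct dpif_foo_flow_dump {
 *         struct dpif_flow_dump up;  // Embedded base structure.
 *         struct ovs_mutex mutex;    // Provider-private dump state.
 *     };
 *
 *     static struct dpif_flow_dump *
 *     dpif_foo_flow_dump_create(const struct dpif *dpif, bool terse,
 *                               struct dpif_flow_dump_types *types)
 *     {
 *         struct dpif_foo_flow_dump *dump = xzalloc(sizeof *dump);
 *
 *         dpif_flow_dump_init(&dump->up, dpif);
 *         dump->up.terse = terse;
 *         ovs_mutex_init(&dump->mutex);
 *         return &dump->up;
 *     }
 */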

struct ct_dpif_dump_state;
struct ct_dpif_entry;
struct ct_dpif_tuple;
struct ct_dpif_timeout_policy;

/* 'dpif_ipf_proto_status' and 'dpif_ipf_status' are presently in
 * sync with 'ipf_proto_status' and 'ipf_status', but more
 * generally represent a superset of present and future support. */
struct dpif_ipf_proto_status {
    uint64_t nfrag_accepted;
    uint64_t nfrag_completed_sent;
    uint64_t nfrag_expired_sent;
    uint64_t nfrag_too_small;
    uint64_t nfrag_overlap;
    uint64_t nfrag_purged;
    unsigned int min_frag_size;
    bool enabled;
};

struct dpif_ipf_status {
    struct dpif_ipf_proto_status v4;
    struct dpif_ipf_proto_status v6;
    unsigned int nfrag;
    unsigned int nfrag_max;
};

/* Datapath interface class structure, to be defined by each implementation
 * of a datapath interface.
 *
 * These functions return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.
 *
 * These functions are expected to execute synchronously, that is, to block
 * as necessary to obtain a result.  Thus, they may not return EAGAIN or
 * EWOULDBLOCK or EINPROGRESS.  We may relax this requirement in the future
 * if and when we encounter performance problems. */
struct dpif_class {
    /* Type of dpif in this class, e.g. "system", "netdev", etc.
     *
     * One of the providers should supply a "system" type, since this is
     * the type assumed if no type is specified when opening a dpif. */
    const char *type;

    /* If 'true', datapath ports should be destroyed on ofproto destruction.
     *
     * This is used by the vswitch at exit, so that it can clean up any
     * datapaths that cannot exist without it (e.g. the netdev datapath). */
    bool cleanup_required;

    /* Called when the dpif provider is registered, typically at program
     * startup.  Returning an error from this function will prevent any
     * datapath with this class from being created.
     *
     * This function may be set to null if a datapath class needs no
     * initialization at registration time. */
    int (*init)(void);

    /* Enumerates the names of all known created datapaths (of class
     * 'dpif_class'), if possible, into 'all_dps'.  The caller has already
     * initialized 'all_dps' and other dpif classes might already have added
     * names to it.
     *
     * This is used by the vswitch at startup, so that it can delete any
     * datapaths that are not configured.
     *
     * Some kinds of datapaths might not be practically enumerable, in which
     * case this function may be a null pointer. */
    int (*enumerate)(struct sset *all_dps,
                     const struct dpif_class *dpif_class);

    /* Returns the type to pass to netdev_open() when a dpif of class
     * 'dpif_class' has a port of type 'type', for a few special cases
     * when a netdev type differs from a port type.  For example, when
     * using the userspace datapath, a port of type "internal" needs to
     * be opened as "tap".
     *
     * Returns either 'type' itself or a string literal, which must not
     * be freed. */
    const char *(*port_open_type)(const struct dpif_class *dpif_class,
                                  const char *type);

    /* Attempts to open an existing dpif called 'name', if 'create' is false,
     * or to open an existing dpif or create a new one, if 'create' is true.
     *
     * 'dpif_class' is the class of dpif to open.
     *
     * If successful, stores a pointer to the new dpif in '*dpifp', which
     * must have class 'dpif_class'.  On failure there are no requirements
     * on what is stored in '*dpifp'. */
    int (*open)(const struct dpif_class *dpif_class,
                const char *name, bool create, struct dpif **dpifp);

    /* Closes 'dpif' and frees associated memory. */
    void (*close)(struct dpif *dpif);

    /* Attempts to destroy the dpif underlying 'dpif'.
     *
     * If successful, 'dpif' will not be used again except as an argument
     * for the 'close' member function. */
    int (*destroy)(struct dpif *dpif);

    /* Performs periodic work needed by 'dpif', if any is necessary.
     * Returns true if the datapath's flows need to be revalidated. */
    bool (*run)(struct dpif *dpif);

    /* Arranges for poll_block() to wake up if the "run" member function
     * needs to be called for 'dpif'. */
    void (*wait)(struct dpif *dpif);

    /* Retrieves statistics for 'dpif' into 'stats'. */
    int (*get_stats)(const struct dpif *dpif, struct dpif_dp_stats *stats);

    /* Updates the datapath's user features to 'user_features'. */
    int (*set_features)(struct dpif *dpif, uint32_t user_features);

    /* Adds 'netdev' as a new port in 'dpif'.  If '*port_no' is not
     * ODPP_NONE, attempts to use that as the port's port number.
     *
     * If the port is successfully added, sets '*port_no' to the new port's
     * port number.  Returns EBUSY if the caller attempted to choose a port
     * number, and it was in use. */
    int (*port_add)(struct dpif *dpif, struct netdev *netdev,
                    odp_port_t *port_no);

    /* Removes port numbered 'port_no' from 'dpif'. */
    int (*port_del)(struct dpif *dpif, odp_port_t port_no);

    /* Refreshes configuration of 'dpif''s port.  The implementation might
     * postpone applying the changes until run() is called. */
    int (*port_set_config)(struct dpif *dpif, odp_port_t port_no,
                           const struct smap *cfg);

    /* Queries 'dpif' for a port with the given 'port_no' or 'devname'.
     * If 'port' is not null, stores information about the port into
     * '*port' if successful.
     *
     * If the port doesn't exist, the provider must return ENODEV.  Other
     * error numbers mean that something went wrong and will be treated
     * differently by upper layers.
     *
     * If 'port' is not null, the caller takes ownership of data in
     * 'port' and must free it with dpif_port_destroy() when it is no
     * longer needed. */
    int (*port_query_by_number)(const struct dpif *dpif, odp_port_t port_no,
                                struct dpif_port *port);
    int (*port_query_by_name)(const struct dpif *dpif, const char *devname,
                              struct dpif_port *port);

    /* Returns the Netlink PID value to supply in OVS_ACTION_ATTR_USERSPACE
     * actions as the OVS_USERSPACE_ATTR_PID attribute's value, for use in
     * flows whose packets arrived on port 'port_no'.
     *
     * A 'port_no' of UINT32_MAX should be treated as a special case.  The
     * implementation should return a reserved PID, not allocated to any
     * port, that the client may use for special purposes.
     *
     * The return value only needs to be meaningful when DPIF_UC_ACTION has
     * been enabled in the 'dpif''s listen mask, and it is allowed to change
     * when DPIF_UC_ACTION is disabled and then re-enabled.
     *
     * A dpif provider that doesn't have meaningful Netlink PIDs can use
     * NULL for this function.  This is equivalent to always returning 0. */
    uint32_t (*port_get_pid)(const struct dpif *dpif, odp_port_t port_no);

    /* Attempts to begin dumping the ports in a dpif.  On success, returns 0
     * and initializes '*statep' with any data needed for iteration.  On
     * failure, returns a positive errno value. */
    int (*port_dump_start)(const struct dpif *dpif, void **statep);

    /* Attempts to retrieve another port from 'dpif' for 'state', which was
     * initialized by a successful call to the 'port_dump_start' function
     * for 'dpif'.  On success, stores a new dpif_port into 'port' and
     * returns 0.  Returns EOF if the end of the port table has been
     * reached, or a positive errno value on error.  This function will not
     * be called again for a given iteration once it returns nonzero (but
     * the 'port_dump_done' function will be called afterward).
     *
     * The dpif provider retains ownership of the data stored in 'port'.  It
     * must remain valid until at least the next call to 'port_dump_next' or
     * 'port_dump_done' for 'state'. */
    int (*port_dump_next)(const struct dpif *dpif, void *state,
                          struct dpif_port *port);

    /* Releases resources from 'dpif' for 'state', which was initialized by
     * a successful call to the 'port_dump_start' function for 'dpif'. */
    int (*port_dump_done)(const struct dpif *dpif, void *state);
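
    /* Taken together, the three functions above support iteration of the
     * following shape (a sketch of a hypothetical direct caller; clients
     * normally use the dpif_port_dump_*() wrappers declared in dpif.h):
     *
     *     const struct dpif_class *class = dpif->dpif_class;
     *     void *state;
     *
     *     if (!class->port_dump_start(dpif, &state)) {
     *         struct dpif_port port;
     *         while (!class->port_dump_next(dpif, state, &port)) {
     *             ...use 'port' without taking ownership of its data...
     *         }
     *         class->port_dump_done(dpif, state);
     *     }
     */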

    /* Polls for changes in the set of ports in 'dpif'.  If the set of ports
     * in 'dpif' has changed, then this function should do one of the
     * following:
     *
     *     - Preferably: store the name of the device that was added to or
     *       deleted from 'dpif' in '*devnamep' and return 0.  The caller is
     *       responsible for freeing '*devnamep' (with free()) when it no
     *       longer needs it.
     *
     *     - Alternatively: return ENOBUFS, without indicating the device
     *       that was added or deleted.
     *
     * Occasional 'false positives', in which the function returns 0 while
     * indicating a device that was not actually added or deleted or returns
     * ENOBUFS without any change, are acceptable.
     *
     * If the set of ports in 'dpif' has not changed, returns EAGAIN.  May
     * also return other positive errno values to indicate that something
     * has gone wrong. */
    int (*port_poll)(const struct dpif *dpif, char **devnamep);

    /* Arranges for the poll loop to wake up when 'port_poll' will return a
     * value other than EAGAIN. */
    void (*port_poll_wait)(const struct dpif *dpif);

    /* Deletes all flows from 'dpif' and clears all of its queues of
     * received packets. */
    int (*flow_flush)(struct dpif *dpif);

    /* Flow dumping interface.
     *
     * This is the back-end for the flow dumping interface described in
     * dpif.h.  Please read the comments there first, because this code
     * closely follows it.
     *
     * 'flow_dump_create' and 'flow_dump_thread_create' must always return
     * an initialized and usable data structure and defer error return until
     * flow_dump_destroy().  This hasn't been a problem for the dpifs that
     * exist so far.
     *
     * 'flow_dump_create' and 'flow_dump_thread_create' must initialize the
     * structures that they return with dpif_flow_dump_init() and
     * dpif_flow_dump_thread_init(), respectively.
     *
     * If 'terse' is true, then only UID and statistics will be returned in
     * the dump.  Otherwise, all fields will be returned.
     *
     * If 'types' isn't null, dumps only the flows of the passed types. */
    struct dpif_flow_dump *(*flow_dump_create)(
        const struct dpif *dpif,
        bool terse,
        struct dpif_flow_dump_types *types);
    int (*flow_dump_destroy)(struct dpif_flow_dump *dump);

    struct dpif_flow_dump_thread *(*flow_dump_thread_create)(
        struct dpif_flow_dump *dump);
    void (*flow_dump_thread_destroy)(struct dpif_flow_dump_thread *thread);

    int (*flow_dump_next)(struct dpif_flow_dump_thread *thread,
                          struct dpif_flow *flows, int max_flows);
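
    /* From the client's side (through the dpif.h wrappers), a dump then
     * proceeds roughly as sketched below; MAX_FLOWS is a caller-chosen
     * batch size and error handling is elided:
     *
     *     struct dpif_flow_dump *dump
     *         = dpif_flow_dump_create(dpif, false, NULL);
     *     struct dpif_flow_dump_thread *thread
     *         = dpif_flow_dump_thread_create(dump);
     *     struct dpif_flow flows[MAX_FLOWS];
     *     int n;
     *
     *     while ((n = dpif_flow_dump_next(thread, flows, MAX_FLOWS)) > 0) {
     *         ...process 'n' flows...
     *     }
     *     dpif_flow_dump_thread_destroy(thread);
     *     dpif_flow_dump_destroy(dump);
     */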

    /* Executes each of the 'n_ops' operations in 'ops' on 'dpif', in the
     * order in which they are specified, placing each operation's results
     * in the "output" members documented in comments and the 'error' member
     * of each dpif_op.  The 'offload_type' argument tells the provider
     * whether 'ops' should be submitted to a netdev (only offload), to the
     * kernel datapath (never offload), or to both (offload if possible,
     * with software fallback). */
    void (*operate)(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
                    enum dpif_offload_type offload_type);
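
    /* Client code reaches this through dpif_operate(), batching several
     * operations into one call, e.g. (a sketch; filling in each dpif_op's
     * type-specific members is elided):
     *
     *     struct dpif_op op1, op2;
     *     struct dpif_op *ops[] = { &op1, &op2 };
     *
     *     ...populate op1 and op2...
     *     dpif_operate(dpif, ops, ARRAY_SIZE(ops), DPIF_OFFLOAD_AUTO);
     *     if (op1.error) {
     *         ...op1 failed; op2 carries its own 'error' independently...
     *     }
     */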

    /* Enables or disables receiving packets with dpif_recv() for 'dpif'.
     * Turning packet receive off and then back on is allowed to change
     * Netlink PID assignments (see ->port_get_pid()).  The client is
     * responsible for updating flows as necessary if it does this. */
    int (*recv_set)(struct dpif *dpif, bool enable);

    /* Refreshes the poll loops and Netlink sockets associated with each
     * port, when the number of upcall handlers (upcall receiving threads)
     * is changed to 'n_handlers' and receiving packets for 'dpif' is
     * enabled by recv_set().
     *
     * Since multiple upcall handlers can read upcalls simultaneously from
     * 'dpif', each port can have multiple Netlink sockets, one per upcall
     * handler.  So, handlers_set() is responsible for the following tasks:
     *
     *    When receiving upcalls is enabled, extend or create the
     *    configuration to support:
     *
     *        - 'n_handlers' Netlink sockets for each port.
     *
     *        - 'n_handlers' poll loops, one for each upcall handler.
     *
     *        - Registering the Netlink sockets for the same upcall handler
     *          to the corresponding poll loop. */
    int (*handlers_set)(struct dpif *dpif, uint32_t n_handlers);

    /* Pass custom configuration options to the datapath.  The
     * implementation might postpone applying the changes until run() is
     * called. */
    int (*set_config)(struct dpif *dpif, const struct smap *other_config);

    /* Translates OpenFlow queue ID 'queue_id' (in host byte order) into a
     * priority value used for setting packet priority. */
    int (*queue_to_priority)(const struct dpif *dpif, uint32_t queue_id,
                             uint32_t *priority);

    /* Polls for an upcall from 'dpif' for an upcall handler.  Since there
     * can be multiple poll loops (see ->handlers_set()), 'handler_id' is
     * needed as an index to identify the corresponding poll loop.  If
     * successful, stores the upcall into '*upcall', using 'buf' for
     * storage.  Should only be called if 'recv_set' has been used to
     * enable receiving packets from 'dpif'.
     *
     * The implementation should point 'upcall->key' and 'upcall->userdata'
     * (if any) into data in the caller-provided 'buf'.  The implementation
     * may also use 'buf' for storing the data of 'upcall->packet'.  If
     * necessary to make room, the implementation may reallocate the data
     * in 'buf'.
     *
     * The caller owns the data of 'upcall->packet' and may modify it.  If
     * the packet's headroom is exhausted as it is manipulated,
     * 'upcall->packet' will be reallocated.  This requires the data of
     * 'upcall->packet' to be released with ofpbuf_uninit() before 'upcall'
     * is destroyed.  However, when an error is returned, 'upcall->packet'
     * may be uninitialized and should not be released.
     *
     * This function must not block.  If no upcall is pending when it is
     * called, it should return EAGAIN without blocking. */
    int (*recv)(struct dpif *dpif, uint32_t handler_id,
                struct dpif_upcall *upcall, struct ofpbuf *buf);

    /* Arranges for the poll loop for an upcall handler to wake up when
     * 'dpif' has a message queued to be received with the recv member
     * functions.  Since there can be multiple poll loops (see
     * ->handlers_set()), 'handler_id' is needed as an index to identify
     * the corresponding poll loop. */
    void (*recv_wait)(struct dpif *dpif, uint32_t handler_id);

    /* Throws away any queued upcalls that 'dpif' currently has ready to
     * return. */
    void (*recv_purge)(struct dpif *dpif);
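
    /* A handler thread built on the dpif.h wrappers combines these member
     * functions roughly as follows (a sketch; upcall processing and most
     * error handling are elided):
     *
     *     dpif_recv_set(dpif, true);
     *     for (;;) {
     *         struct dpif_upcall upcall;
     *         struct ofpbuf buf;
     *         int error;
     *
     *         ofpbuf_init(&buf, 0);
     *         error = dpif_recv(dpif, handler_id, &upcall, &buf);
     *         if (!error) {
     *             ...process 'upcall', whose pointers reference 'buf'...
     *         } else if (error == EAGAIN) {
     *             dpif_recv_wait(dpif, handler_id);
     *             poll_block();
     *         }
     *         ofpbuf_uninit(&buf);
     *     }
     */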

    /* When 'dpif' is about to purge the datapath, the higher layer may want
     * to be notified so that it can try to react accordingly (e.g. grabbing
     * all flow stats before they are gone).
     *
     * Registers an upcall callback function with 'dpif'.  This is only used
     * if 'dpif' needs to notify the purging of the datapath.  'aux' is
     * passed to the callback on invocation. */
    void (*register_dp_purge_cb)(struct dpif *, dp_purge_callback *,
                                 void *aux);

    /* For datapaths that run in userspace (i.e. dpif-netdev), threads
     * polling for incoming packets can directly call upcall functions
     * instead of offloading packet processing to separate handler threads.
     * Datapaths that directly call upcall functions should use the
     * functions below to register an upcall function and enable / disable
     * upcalls.
     *
     * Registers an upcall callback function with 'dpif'.  This is only
     * used if 'dpif' directly executes upcall functions.  'aux' is passed
     * to the callback on invocation. */
    void (*register_upcall_cb)(struct dpif *, upcall_callback *, void *aux);

    /* Enables upcalls if 'dpif' directly executes upcall functions. */
    void (*enable_upcall)(struct dpif *);

    /* Disables upcalls if 'dpif' directly executes upcall functions. */
    void (*disable_upcall)(struct dpif *);

    /* Get datapath version.  Caller is responsible for freeing the string
     * returned. */
    char *(*get_datapath_version)(void);

    /* Conntrack entry dumping interface.
     *
     * These functions are used by ct-dpif.c to provide a datapath-agnostic
     * dumping interface to the connection trackers provided by the
     * datapaths.
     *
     * ct_dump_start() should put in '*state' a pointer to a newly allocated
     * structure that will be passed by the caller to ct_dump_next() and
     * ct_dump_done().  If 'zone' is not NULL, only the entries in '*zone'
     * should be dumped.
     *
     * ct_dump_next() should fill 'entry' with information from a connection
     * and prepare to dump the next one on a subsequent invocation.
     *
     * ct_dump_done() should perform any cleanup necessary (including
     * deallocating the 'state' structure, if applicable). */
    int (*ct_dump_start)(struct dpif *, struct ct_dpif_dump_state **state,
                         const uint16_t *zone, int *);
    int (*ct_dump_next)(struct dpif *, struct ct_dpif_dump_state *state,
                        struct ct_dpif_entry *entry);
    int (*ct_dump_done)(struct dpif *, struct ct_dpif_dump_state *state);
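
    /* A dump therefore has the following shape (a sketch; the trailing
     * unnamed 'int *' out-parameter is assumed here to report
     * datapath-specific information, such as the total number of conntrack
     * buckets):
     *
     *     const struct dpif_class *class = dpif->dpif_class;
     *     struct ct_dpif_dump_state *dump;
     *     struct ct_dpif_entry entry;
     *     int extra;
     *
     *     if (!class->ct_dump_start(dpif, &dump, NULL, &extra)) {
     *         while (!class->ct_dump_next(dpif, dump, &entry)) {
     *             ...consume 'entry'...
     *         }
     *         class->ct_dump_done(dpif, dump);
     *     }
     */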

    /* Flushes the connection tracking tables.  The arguments have the
     * following behavior:
     *
     *   - If both 'zone' and 'tuple' are NULL, flush all the conntrack
     *     entries.
     *
     *   - If 'zone' is not NULL, and 'tuple' is NULL, flush all the
     *     conntrack entries in '*zone'.
     *
     *   - If 'tuple' is not NULL, flush the conntrack entry specified by
     *     'tuple' in '*zone'.  If 'zone' is NULL, use the default zone
     *     (zone 0). */
    int (*ct_flush)(struct dpif *, const uint16_t *zone,
                    const struct ct_dpif_tuple *tuple);

    /* Set max connections allowed. */
    int (*ct_set_maxconns)(struct dpif *, uint32_t maxconns);
    /* Get max connections allowed. */
    int (*ct_get_maxconns)(struct dpif *, uint32_t *maxconns);
    /* Get number of connections tracked. */
    int (*ct_get_nconns)(struct dpif *, uint32_t *nconns);

    /* Enable or disable TCP sequence checking. */
    int (*ct_set_tcp_seq_chk)(struct dpif *, bool enabled);
    /* Get the TCP sequence checking configuration. */
    int (*ct_get_tcp_seq_chk)(struct dpif *, bool *enabled);

    /* Connection tracking per zone limit.
     *
     * A per zone conntrack limit sets the maximum allowed connections in a
     * zone to provide resource isolation.  If a per zone limit for a
     * particular zone is not available in the datapath, it defaults to the
     * default per zone limit.  Initially, the default per zone limit is
     * unlimited (0). */

    /* Sets the max connections allowed per zone according to 'zone_limits',
     * a list of 'struct ct_dpif_zone_limit' entries (the 'count' member
     * is not used when setting limits).  If 'default_limit' is not NULL,
     * modifies the default limit to '*default_limit'. */
    int (*ct_set_limits)(struct dpif *, const uint32_t *default_limit,
                         const struct ovs_list *zone_limits);

    /* Looks up the default per zone limit and stores it in
     * 'default_limit'.  Looks up the per zone limits for all zones in
     * the 'zone_limits_in' list of 'struct ct_dpif_zone_limit' entries
     * (the 'limit' and 'count' members are not used), and stores the
     * reply, which includes the zone, the per zone limit, and the number
     * of connections in the zone, into the 'zone_limits_out' list. */
    int (*ct_get_limits)(struct dpif *, uint32_t *default_limit,
                         const struct ovs_list *zone_limits_in,
                         struct ovs_list *zone_limits_out);

    /* Deletes the per zone limit of all zones specified in 'zone_limits',
     * a list of 'struct ct_dpif_zone_limit' entries. */
    int (*ct_del_limits)(struct dpif *, const struct ovs_list *zone_limits);
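
    /* Callers assemble 'zone_limits' as an ovs_list, e.g. (a sketch that
     * assumes the 'struct ct_dpif_zone_limit' layout from ct-dpif.h, whose
     * entries are linked through a 'node' member):
     *
     *     struct ovs_list zone_limits
     *         = OVS_LIST_INITIALIZER(&zone_limits);
     *     struct ct_dpif_zone_limit *zl = xzalloc(sizeof *zl);
     *
     *     zl->zone = 5;              // Zone to constrain.
     *     zl->limit = 1024;          // Maximum connections in the zone.
     *     ovs_list_push_back(&zone_limits, &zl->node);
     *     ...pass '&zone_limits' to ct_set_limits(), then free entries...
     */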

    /* Connection tracking timeout policy.
     *
     * A connection tracking timeout policy contains a list of timeout
     * attributes that specify timeout values on various connection states.
     * In a datapath, the timeout policy is identified by a 4-byte unsigned
     * integer.  Unsupported timeout attributes are ignored.  When a
     * connection is committed it can be associated with a timeout
     * policy, or it defaults to the datapath's default timeout policy. */

    /* Sets timeout policy '*tp' into the datapath. */
    int (*ct_set_timeout_policy)(struct dpif *,
                                 const struct ct_dpif_timeout_policy *tp);
    /* Gets the timeout policy specified by 'tp_id' and stores it into
     * '*tp'. */
    int (*ct_get_timeout_policy)(struct dpif *, uint32_t tp_id,
                                 struct ct_dpif_timeout_policy *tp);
    /* Deletes the timeout policy identified by 'tp_id'. */
    int (*ct_del_timeout_policy)(struct dpif *, uint32_t tp_id);

    /* Conntrack timeout policy dumping interface.
     *
     * These functions provide a datapath-agnostic dumping interface
     * to the conntrack timeout policies provided by the datapaths.
     *
     * ct_timeout_policy_dump_start() should put in '*statep' a pointer to
     * a newly allocated structure that will be passed by the caller to
     * ct_timeout_policy_dump_next() and ct_timeout_policy_dump_done().
     *
     * ct_timeout_policy_dump_next() attempts to retrieve another timeout
     * policy from 'dpif' for 'state', which was initialized by a successful
     * call to ct_timeout_policy_dump_start().  On success, stores a new
     * timeout policy into 'tp' and returns 0.  Returns EOF if the last
     * timeout policy has been dumped, or a positive errno value on error.
     * This function will not be called again for a given iteration once it
     * returns nonzero (but ct_timeout_policy_dump_done() will be called
     * afterward).
     *
     * ct_timeout_policy_dump_done() should perform any cleanup necessary
     * (including deallocating the 'state' structure, if applicable). */
    int (*ct_timeout_policy_dump_start)(struct dpif *, void **statep);
    int (*ct_timeout_policy_dump_next)(struct dpif *, void *state,
                                       struct ct_dpif_timeout_policy *tp);
    int (*ct_timeout_policy_dump_done)(struct dpif *, void *state);

    /* Gets the timeout policy name based on 'tp_id', 'dl_type' and
     * 'nw_proto'.  On success, returns 0, stores the timeout policy name in
     * 'tp_name', and sets 'is_generic'.  'is_generic' is false if the
     * returned timeout policy in the 'dpif' is specific to 'dl_type' and
     * 'nw_proto' in the datapath (e.g., the Linux kernel datapath), and
     * true if the timeout policy supports all OVS supported L3/L4
     * protocols.
     *
     * The caller is responsible for freeing 'tp_name'. */
    int (*ct_get_timeout_policy_name)(struct dpif *, uint32_t tp_id,
                                      uint16_t dl_type, uint8_t nw_proto,
                                      char **tp_name, bool *is_generic);

    /* IP Fragmentation. */

    /* Disables or enables conntrack fragment reassembly.  The default
     * setting is enabled. */
    int (*ipf_set_enabled)(struct dpif *, bool v6, bool enabled);

    /* Sets the minimum fragment size allowed. */
    int (*ipf_set_min_frag)(struct dpif *, bool v6, uint32_t min_frag);

    /* Sets the maximum number of fragments tracked. */
    int (*ipf_set_max_nfrags)(struct dpif *, uint32_t max_nfrags);

    /* Gets fragmentation configuration status and counters. */
    int (*ipf_get_status)(struct dpif *,
                          struct dpif_ipf_status *dpif_ipf_status);

    /* The following three functions dump ipf lists by creating a string
     * representation of the state of an ipf list, to which 'dump' is
     * pointed.  'ipf_dump_start()' allocates memory for 'ipf_dump_ctx'.
     * 'ipf_dump_next()' finds the next ipf list and copies its
     * characteristics to a string, which is freed by the caller.
     * 'ipf_dump_done()' frees the 'ipf_dump_ctx' that was allocated in
     * 'ipf_dump_start()'. */
    int (*ipf_dump_start)(struct dpif *, struct ipf_dump_ctx **ipf_dump_ctx);
    int (*ipf_dump_next)(struct dpif *, void *ipf_dump_ctx, char **dump);
    int (*ipf_dump_done)(struct dpif *, void *ipf_dump_ctx);

    /* Meters */

    /* Queries 'dpif' for supported meter features.
     * NULL pointer means no meter features are supported. */
    void (*meter_get_features)(const struct dpif *,
                               struct ofputil_meter_features *);

    /* Adds or modifies the meter in 'dpif' with the given 'meter_id'
     * and the configuration in 'config'.
     *
     * The meter id specified through 'config->meter_id' is ignored. */
    int (*meter_set)(struct dpif *, ofproto_meter_id meter_id,
                     struct ofputil_meter_config *);

    /* Queries 'dpif' for meter stats with the given 'meter_id'.  Stores a
     * maximum of 'n_bands' meter statistics, returning the number of band
     * stats returned in 'stats->n_bands' if successful. */
    int (*meter_get)(const struct dpif *, ofproto_meter_id meter_id,
                     struct ofputil_meter_stats *, uint16_t n_bands);

    /* Removes meter 'meter_id' from 'dpif'.  Stores meter and band
     * statistics for a maximum of 'n_bands', returning the number of band
     * stats returned in 'stats->n_bands' if successful.  'stats' may be
     * passed in as NULL if no stats are needed, in which case 'n_bands'
     * must be passed in as zero. */
    int (*meter_del)(struct dpif *, ofproto_meter_id meter_id,
                     struct ofputil_meter_stats *, uint16_t n_bands);

    /* Adds a bond with 'bond_id' and the member-map to 'dpif'. */
    int (*bond_add)(struct dpif *dpif, uint32_t bond_id,
                    odp_port_t *member_map);

    /* Removes the bond identified by 'bond_id' from 'dpif'. */
    int (*bond_del)(struct dpif *dpif, uint32_t bond_id);

    /* Reads bond stats from 'dpif'.  'n_bytes' should be an array large
     * enough to hold BOND_BUCKETS elements. */
    int (*bond_stats_get)(struct dpif *dpif, uint32_t bond_id,
                          uint64_t *n_bytes);
};
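
/* Each provider defines one 'struct dpif_class' per datapath type and makes
 * it known to the dpif layer, normally via dp_register_provider().  Member
 * functions for unsupported features are left null.  A minimal sketch with
 * hypothetical "foo" names:
 *
 *     const struct dpif_class dpif_foo_class = {
 *         .type = "foo",
 *         .cleanup_required = true,
 *         .open = dpif_foo_open,
 *         .close = dpif_foo_close,
 *         .destroy = dpif_foo_destroy,
 *         .port_add = dpif_foo_port_add,
 *         .port_del = dpif_foo_port_del,
 *         ...and so on for the other supported members...
 *     };
 */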

extern const struct dpif_class dpif_netlink_class;
extern const struct dpif_class dpif_netdev_class;

#ifdef __cplusplus
}
#endif

#endif /* dpif-provider.h */