/*
* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "dpif-provider.h"
#include <ctype.h>
#include <errno.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include "coverage.h"
#include "dp-packet.h"
#include "dpctl.h"
#include "dpif-netdev.h"
#include "flow.h"
#include "netdev-offload.h"
#include "netdev-provider.h"
#include "netdev.h"
#include "netlink.h"
#include "odp-execute.h"
#include "odp-util.h"
#include "packets.h"
#include "route-table.h"
#include "seq.h"
#include "sset.h"
#include "timeval.h"
#include "tnl-neigh-cache.h"
#include "tnl-ports.h"
#include "util.h"
#include "uuid.h"
#include "valgrind.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/ofp-errors.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/ofpbuf.h"
#include "openvswitch/poll-loop.h"
#include "openvswitch/shash.h"
#include "openvswitch/usdt-probes.h"
#include "openvswitch/vlog.h"
VLOG_DEFINE_THIS_MODULE(dpif);
COVERAGE_DEFINE(dpif_destroy);
COVERAGE_DEFINE(dpif_execute);
COVERAGE_DEFINE(dpif_execute_error);
COVERAGE_DEFINE(dpif_execute_with_help);
COVERAGE_DEFINE(dpif_flow_del);
COVERAGE_DEFINE(dpif_flow_del_error);
COVERAGE_DEFINE(dpif_flow_flush);
COVERAGE_DEFINE(dpif_flow_get);
COVERAGE_DEFINE(dpif_flow_get_error);
COVERAGE_DEFINE(dpif_flow_put);
COVERAGE_DEFINE(dpif_flow_put_error);
COVERAGE_DEFINE(dpif_meter_del);
COVERAGE_DEFINE(dpif_meter_get);
COVERAGE_DEFINE(dpif_meter_set);
COVERAGE_DEFINE(dpif_port_add);
COVERAGE_DEFINE(dpif_port_del);
COVERAGE_DEFINE(dpif_purge);
static const struct dpif_class *base_dpif_classes[] = {
#if defined(__linux__) || defined(_WIN32)
&dpif_netlink_class,
#endif
&dpif_netdev_class,
};
struct registered_dpif_class {
const struct dpif_class *dpif_class;
int refcount;
};
static struct shash dpif_classes = SHASH_INITIALIZER(&dpif_classes);
static struct sset dpif_disallowed = SSET_INITIALIZER(&dpif_disallowed);
/* Protects 'dpif_classes', including the refcount, and 'dpif_disallowed'. */
static struct ovs_mutex dpif_mutex = OVS_MUTEX_INITIALIZER;
/* Rate limit for individual messages going to or from the datapath, output at
* DBG level. This is very high because, if these are enabled, it is because
* we really need to see them. */
static struct vlog_rate_limit dpmsg_rl = VLOG_RATE_LIMIT_INIT(600, 600);
/* Not really much point in logging many dpif errors. */
static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5);
static void log_operation(const struct dpif *, const char *operation,
int error);
static bool should_log_flow_message(const struct vlog_module *module,
int error);
/* Incremented whenever tnl route, arp, etc changes. */
struct seq *tnl_conf_seq;
static bool
dpif_is_tap_port(const char *type)
{
return !strcmp(type, "tap");
}
static void
dp_initialize(void)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
if (ovsthread_once_start(&once)) {
int i;
tnl_conf_seq = seq_create();
dpctl_unixctl_register();
tnl_port_map_init();
tnl_neigh_cache_init();
route_table_init();
for (i = 0; i < ARRAY_SIZE(base_dpif_classes); i++) {
dp_register_provider(base_dpif_classes[i]);
}
ovsthread_once_done(&once);
}
}
static int
dp_register_provider__(const struct dpif_class *new_class)
{
struct registered_dpif_class *registered_class;
int error;
if (sset_contains(&dpif_disallowed, new_class->type)) {
VLOG_DBG("attempted to register disallowed provider: %s",
new_class->type);
return EINVAL;
}
if (shash_find(&dpif_classes, new_class->type)) {
VLOG_WARN("attempted to register duplicate datapath provider: %s",
new_class->type);
return EEXIST;
}
error = new_class->init ? new_class->init() : 0;
if (error) {
VLOG_WARN("failed to initialize %s datapath class: %s",
new_class->type, ovs_strerror(error));
return error;
}
registered_class = xmalloc(sizeof *registered_class);
registered_class->dpif_class = new_class;
registered_class->refcount = 0;
shash_add(&dpif_classes, new_class->type, registered_class);
return 0;
}
/* Registers a new datapath provider. After successful registration, new
* datapaths of that type can be opened using dpif_open(). */
int
dp_register_provider(const struct dpif_class *new_class)
{
int error;
ovs_mutex_lock(&dpif_mutex);
error = dp_register_provider__(new_class);
ovs_mutex_unlock(&dpif_mutex);
return error;
}
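/* Illustrative sketch (not compiled): registering a hypothetical
 * out-of-tree provider.  'my_dpif_class' is an invented name and not part
 * of the tree. */
#if 0
static void
example_register_provider(void)
{
    extern const struct dpif_class my_dpif_class;
    int error = dp_register_provider(&my_dpif_class);

    if (error) {
        VLOG_WARN("failed to register example datapath class (%s)",
                  ovs_strerror(error));
    }
}
#endif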
/* Unregisters a datapath provider. 'type' must have been previously
* registered and not currently be in use by any dpifs. After unregistration
* new datapaths of that type cannot be opened using dpif_open(). */
static int
dp_unregister_provider__(const char *type)
{
struct shash_node *node;
struct registered_dpif_class *registered_class;
node = shash_find(&dpif_classes, type);
if (!node) {
return EAFNOSUPPORT;
}
registered_class = node->data;
if (registered_class->refcount) {
VLOG_WARN("attempted to unregister in use datapath provider: %s", type);
return EBUSY;
}
shash_delete(&dpif_classes, node);
free(registered_class);
return 0;
}
/* Unregisters a datapath provider. 'type' must have been previously
* registered and not currently be in use by any dpifs. After unregistration
* new datapaths of that type cannot be opened using dpif_open(). */
int
dp_unregister_provider(const char *type)
{
int error;
dp_initialize();
ovs_mutex_lock(&dpif_mutex);
error = dp_unregister_provider__(type);
ovs_mutex_unlock(&dpif_mutex);
return error;
}
/* Disallows a provider. Causes future calls of dp_register_provider() with
* a dpif_class which implements 'type' to fail. */
void
dp_disallow_provider(const char *type)
{
ovs_mutex_lock(&dpif_mutex);
sset_add(&dpif_disallowed, type);
ovs_mutex_unlock(&dpif_mutex);
}
/* Adds the types of all currently registered datapath providers to 'types'.
* The caller must first initialize the sset. */
void
dp_enumerate_types(struct sset *types)
{
struct shash_node *node;
dp_initialize();
ovs_mutex_lock(&dpif_mutex);
SHASH_FOR_EACH(node, &dpif_classes) {
const struct registered_dpif_class *registered_class = node->data;
sset_add(types, registered_class->dpif_class->type);
}
ovs_mutex_unlock(&dpif_mutex);
}
static void
dp_class_unref(struct registered_dpif_class *rc)
{
ovs_mutex_lock(&dpif_mutex);
ovs_assert(rc->refcount);
rc->refcount--;
ovs_mutex_unlock(&dpif_mutex);
}
static struct registered_dpif_class *
dp_class_lookup(const char *type)
{
struct registered_dpif_class *rc;
ovs_mutex_lock(&dpif_mutex);
rc = shash_find_data(&dpif_classes, type);
if (rc) {
rc->refcount++;
}
ovs_mutex_unlock(&dpif_mutex);
return rc;
}
/* Clears 'names' and enumerates the names of all known created datapaths with
* the given 'type'. The caller must first initialize the sset. Returns 0 if
* successful, otherwise a positive errno value.
*
* Some kinds of datapaths might not be practically enumerable. This is not
* considered an error. */
int
dp_enumerate_names(const char *type, struct sset *names)
{
struct registered_dpif_class *registered_class;
const struct dpif_class *dpif_class;
int error;
dp_initialize();
sset_clear(names);
registered_class = dp_class_lookup(type);
if (!registered_class) {
VLOG_WARN("could not enumerate unknown type: %s", type);
return EAFNOSUPPORT;
}
dpif_class = registered_class->dpif_class;
error = (dpif_class->enumerate
? dpif_class->enumerate(names, dpif_class)
: 0);
if (error) {
VLOG_WARN("failed to enumerate %s datapaths: %s", dpif_class->type,
ovs_strerror(error));
}
dp_class_unref(registered_class);
return error;
}
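/* Illustrative sketch (not compiled): listing every registered datapath
 * type and the datapaths of each type, using the two enumeration calls
 * above.  Logging the results is arbitrary here. */
#if 0
static void
example_enumerate_all(void)
{
    struct sset types = SSET_INITIALIZER(&types);
    const char *type;

    dp_enumerate_types(&types);
    SSET_FOR_EACH (type, &types) {
        struct sset names = SSET_INITIALIZER(&names);
        const char *name;

        if (!dp_enumerate_names(type, &names)) {
            SSET_FOR_EACH (name, &names) {
                VLOG_INFO("%s@%s", type, name);
            }
        }
        sset_destroy(&names);
    }
    sset_destroy(&types);
}
#endif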
/* Parses 'datapath_name_', which is of the form [type@]name into its
* component pieces. 'name' and 'type' must be freed by the caller.
*
* The returned 'type' is normalized, as if by dpif_normalize_type(). */
void
dp_parse_name(const char *datapath_name_, char **name, char **type)
{
char *datapath_name = xstrdup(datapath_name_);
char *separator;
separator = strchr(datapath_name, '@');
if (separator) {
*separator = '\0';
*type = datapath_name;
*name = xstrdup(dpif_normalize_type(separator + 1));
} else {
*name = datapath_name;
*type = xstrdup(dpif_normalize_type(NULL));
}
}
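/* Illustrative sketch (not compiled): "netdev@br0" parses to type
 * "netdev" and name "br0", while a bare "br0" gets the default type
 * "system".  The bridge name is an example only. */
#if 0
static void
example_parse_name(void)
{
    char *name, *type;

    dp_parse_name("netdev@br0", &name, &type);
    /* Now name is "br0" and type is "netdev"; both must be freed. */
    free(name);
    free(type);
}
#endif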
static int
do_open(const char *name, const char *type, bool create, struct dpif **dpifp)
{
struct dpif *dpif = NULL;
int error;
struct registered_dpif_class *registered_class;
dp_initialize();
type = dpif_normalize_type(type);
registered_class = dp_class_lookup(type);
if (!registered_class) {
VLOG_WARN("could not create datapath %s of unknown type %s", name,
type);
error = EAFNOSUPPORT;
goto exit;
}
error = registered_class->dpif_class->open(registered_class->dpif_class,
name, create, &dpif);
if (!error) {
const char *dpif_type_str = dpif_normalize_type(dpif_type(dpif));
struct dpif_port_dump port_dump;
struct dpif_port dpif_port;
ovs_assert(dpif->dpif_class == registered_class->dpif_class);
DPIF_PORT_FOR_EACH(&dpif_port, &port_dump, dpif) {
struct netdev *netdev;
int err;
if (dpif_is_tap_port(dpif_port.type)) {
continue;
}
err = netdev_open(dpif_port.name, dpif_port.type, &netdev);
if (!err) {
netdev_set_dpif_type(netdev, dpif_type_str);
netdev_ports_insert(netdev, &dpif_port);
netdev_close(netdev);
} else {
VLOG_WARN("could not open netdev %s type %s: %s",
dpif_port.name, dpif_port.type, ovs_strerror(err));
}
}
} else {
dp_class_unref(registered_class);
}
exit:
*dpifp = error ? NULL : dpif;
return error;
}
/* Tries to open an existing datapath named 'name' and type 'type'. Will fail
* if no datapath with 'name' and 'type' exists. 'type' may be either NULL or
* the empty string to specify the default system type. Returns 0 if
* successful, otherwise a positive errno value. On success stores a pointer
* to the datapath in '*dpifp', otherwise a null pointer. */
int
dpif_open(const char *name, const char *type, struct dpif **dpifp)
{
return do_open(name, type, false, dpifp);
}
/* Tries to create and open a new datapath with the given 'name' and 'type'.
* 'type' may be either NULL or the empty string to specify the default system
* type. Will fail if a datapath with 'name' and 'type' already exists.
* Returns 0 if successful, otherwise a positive errno value. On success
* stores a pointer to the datapath in '*dpifp', otherwise a null pointer. */
int
dpif_create(const char *name, const char *type, struct dpif **dpifp)
{
return do_open(name, type, true, dpifp);
}
/* Tries to open a datapath with the given 'name' and 'type', creating it if it
* does not exist. 'type' may be either NULL or the empty string to specify
* the default system type. Returns 0 if successful, otherwise a positive
* errno value. On success stores a pointer to the datapath in '*dpifp',
* otherwise a null pointer. */
int
dpif_create_and_open(const char *name, const char *type, struct dpif **dpifp)
{
int error;
error = dpif_create(name, type, dpifp);
if (error == EEXIST || error == EBUSY) {
error = dpif_open(name, type, dpifp);
if (error) {
VLOG_WARN("datapath %s already exists but cannot be opened: %s",
name, ovs_strerror(error));
}
} else if (error) {
VLOG_WARN("failed to create datapath %s: %s",
name, ovs_strerror(error));
}
return error;
}
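/* Illustrative sketch (not compiled): the usual open-or-create pattern.
 * The datapath name "br0" and the "netdev" type are examples. */
#if 0
static struct dpif *
example_open_or_create(void)
{
    struct dpif *dpif;
    int error = dpif_create_and_open("br0", "netdev", &dpif);

    if (error) {
        VLOG_ERR("br0: could not open datapath (%s)", ovs_strerror(error));
        return NULL;
    }
    return dpif;                /* Caller eventually calls dpif_close(). */
}
#endif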
static void
dpif_remove_netdev_ports(struct dpif *dpif)
{
const char *dpif_type_str = dpif_normalize_type(dpif_type(dpif));
struct dpif_port_dump port_dump;
struct dpif_port dpif_port;
DPIF_PORT_FOR_EACH (&dpif_port, &port_dump, dpif) {
if (!dpif_is_tap_port(dpif_port.type)) {
netdev_ports_remove(dpif_port.port_no, dpif_type_str);
}
}
}
/* Closes and frees the connection to 'dpif'. Does not destroy the datapath
* itself; call dpif_delete() first, instead, if that is desirable. */
void
dpif_close(struct dpif *dpif)
{
if (dpif) {
struct registered_dpif_class *rc;
rc = shash_find_data(&dpif_classes, dpif->dpif_class->type);
if (rc->refcount == 1) {
dpif_remove_netdev_ports(dpif);
}
dpif_uninit(dpif, true);
dp_class_unref(rc);
}
}
/* Performs periodic work needed by 'dpif'. */
bool
dpif_run(struct dpif *dpif)
{
if (dpif->dpif_class->run) {
return dpif->dpif_class->run(dpif);
}
return false;
}
/* Arranges for poll_block() to wake up when dp_run() needs to be called for
* 'dpif'. */
void
dpif_wait(struct dpif *dpif)
{
if (dpif->dpif_class->wait) {
dpif->dpif_class->wait(dpif);
}
}
/* Returns the name of datapath 'dpif' prefixed with the type
* (for use in log messages). */
const char *
dpif_name(const struct dpif *dpif)
{
return dpif->full_name;
}
/* Returns the name of datapath 'dpif' without the type
* (for use in device names). */
const char *
dpif_base_name(const struct dpif *dpif)
{
return dpif->base_name;
}
/* Returns the type of datapath 'dpif'. */
const char *
dpif_type(const struct dpif *dpif)
{
return dpif->dpif_class->type;
}
/* Checks if datapath 'dpif' requires cleanup. */
bool
dpif_cleanup_required(const struct dpif *dpif)
{
return dpif->dpif_class->cleanup_required;
}
/* Returns the fully spelled out name for the given datapath 'type'.
*
* Normalized type string can be compared with strcmp(). Unnormalized type
* string might be the same even if they have different spellings. */
const char *
dpif_normalize_type(const char *type)
{
return type && type[0] ? type : "system";
}
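/* For example, dpif_normalize_type(NULL), dpif_normalize_type("") and
 * dpif_normalize_type("system") all yield "system", so two normalized
 * types name the same provider exactly when strcmp() says they are
 * equal. */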
/* Destroys the datapath that 'dpif' is connected to, first removing all of its
* ports. After calling this function, it does not make sense to pass 'dpif'
* to any functions other than dpif_name() or dpif_close(). */
int
dpif_delete(struct dpif *dpif)
{
int error;
COVERAGE_INC(dpif_destroy);
error = dpif->dpif_class->destroy(dpif);
log_operation(dpif, "delete", error);
return error;
}
/* Retrieves statistics for 'dpif' into 'stats'. Returns 0 if successful,
* otherwise a positive errno value. */
int
dpif_get_dp_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
{
int error = dpif->dpif_class->get_stats(dpif, stats);
if (error) {
memset(stats, 0, sizeof *stats);
}
log_operation(dpif, "get_stats", error);
return error;
}
int
dpif_set_features(struct dpif *dpif, uint32_t new_features)
{
int error = dpif->dpif_class->set_features(dpif, new_features);
log_operation(dpif, "set_features", error);
return error;
}
const char *
dpif_port_open_type(const char *datapath_type, const char *port_type)
{
struct registered_dpif_class *rc;
datapath_type = dpif_normalize_type(datapath_type);
ovs_mutex_lock(&dpif_mutex);
rc = shash_find_data(&dpif_classes, datapath_type);
if (rc && rc->dpif_class->port_open_type) {
port_type = rc->dpif_class->port_open_type(rc->dpif_class, port_type);
}
ovs_mutex_unlock(&dpif_mutex);
return port_type;
}
/* Attempts to add 'netdev' as a port on 'dpif'. If 'port_nop' is
* non-null and its value is not ODPP_NONE, then attempts to use the
* value as the port number.
*
* If successful, returns 0 and sets '*port_nop' to the new port's port
* number (if 'port_nop' is non-null). On failure, returns a positive
* errno value and sets '*port_nop' to ODPP_NONE (if 'port_nop' is
* non-null). */
int
dpif_port_add(struct dpif *dpif, struct netdev *netdev, odp_port_t *port_nop)
{
const char *netdev_name = netdev_get_name(netdev);
odp_port_t port_no = ODPP_NONE;
int error;
COVERAGE_INC(dpif_port_add);
if (port_nop) {
port_no = *port_nop;
}
error = dpif->dpif_class->port_add(dpif, netdev, &port_no);
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: added %s as port %"PRIu32,
dpif_name(dpif), netdev_name, port_no);
if (!dpif_is_tap_port(netdev_get_type(netdev))) {
const char *dpif_type_str = dpif_normalize_type(dpif_type(dpif));
struct dpif_port dpif_port;
netdev_set_dpif_type(netdev, dpif_type_str);
dpif_port.type = CONST_CAST(char *, netdev_get_type(netdev));
dpif_port.name = CONST_CAST(char *, netdev_name);
dpif_port.port_no = port_no;
netdev_ports_insert(netdev, &dpif_port);
}
} else {
VLOG_WARN_RL(&error_rl, "%s: failed to add %s as port: %s",
dpif_name(dpif), netdev_name, ovs_strerror(error));
port_no = ODPP_NONE;
}
if (port_nop) {
*port_nop = port_no;
}
return error;
}
/* Attempts to remove 'dpif''s port number 'port_no'. Returns 0 if successful,
* otherwise a positive errno value. */
int
dpif_port_del(struct dpif *dpif, odp_port_t port_no, bool local_delete)
{
int error = 0;
COVERAGE_INC(dpif_port_del);
if (!local_delete) {
error = dpif->dpif_class->port_del(dpif, port_no);
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: port_del(%"PRIu32")",
dpif_name(dpif), port_no);
} else {
log_operation(dpif, "port_del", error);
}
}
netdev_ports_remove(port_no, dpif_normalize_type(dpif_type(dpif)));
return error;
}
/* Makes a deep copy of 'src' into 'dst'. */
void
dpif_port_clone(struct dpif_port *dst, const struct dpif_port *src)
{
dst->name = xstrdup(src->name);
dst->type = xstrdup(src->type);
dst->port_no = src->port_no;
}
/* Frees memory allocated to members of 'dpif_port'.
*
* Do not call this function on a dpif_port obtained from
* dpif_port_dump_next(): that function retains ownership of the data in the
* dpif_port. */
void
dpif_port_destroy(struct dpif_port *dpif_port)
{
free(dpif_port->name);
free(dpif_port->type);
}
/* Checks if port named 'devname' exists in 'dpif'. If so, returns
* true; otherwise, returns false. */
bool
dpif_port_exists(const struct dpif *dpif, const char *devname)
{
int error = dpif->dpif_class->port_query_by_name(dpif, devname, NULL);
if (error != 0 && error != ENODEV) {
VLOG_WARN_RL(&error_rl, "%s: failed to query port %s: %s",
dpif_name(dpif), devname, ovs_strerror(error));
}
return !error;
}
/* Refreshes configuration of 'dpif's port. */
int
dpif_port_set_config(struct dpif *dpif, odp_port_t port_no,
const struct smap *cfg)
{
int error = 0;
if (dpif->dpif_class->port_set_config) {
error = dpif->dpif_class->port_set_config(dpif, port_no, cfg);
if (error) {
log_operation(dpif, "port_set_config", error);
}
}
return error;
}
/* Looks up port number 'port_no' in 'dpif'. On success, returns 0 and
* initializes '*port' appropriately; on failure, returns a positive errno
* value.
*
* Returns ENODEV if the port doesn't exist. Will not log a warning in this
* case unless 'warn_if_not_found' is true.
*
* The caller owns the data in 'port' and must free it with
* dpif_port_destroy() when it is no longer needed. */
int
dpif_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
struct dpif_port *port, bool warn_if_not_found)
{
int error = dpif->dpif_class->port_query_by_number(dpif, port_no, port);
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: port %"PRIu32" is device %s",
dpif_name(dpif), port_no, port->name);
} else {
memset(port, 0, sizeof *port);
if (error == ENODEV && !warn_if_not_found) {
VLOG_DBG_RL(&dpmsg_rl, "%s: failed to query port %"PRIu32": %s",
dpif_name(dpif), port_no, ovs_strerror(error));
} else {
VLOG_WARN_RL(&error_rl, "%s: failed to query port %"PRIu32": %s",
dpif_name(dpif), port_no, ovs_strerror(error));
}
}
return error;
}
/* Looks up port named 'devname' in 'dpif'. On success, returns 0 and
* initializes '*port' appropriately; on failure, returns a positive errno
* value.
*
* Returns ENODEV if the port doesn't exist.
*
* The caller owns the data in 'port' and must free it with
* dpif_port_destroy() when it is no longer needed. */
int
dpif_port_query_by_name(const struct dpif *dpif, const char *devname,
struct dpif_port *port)
{
int error = dpif->dpif_class->port_query_by_name(dpif, devname, port);
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: device %s is on port %"PRIu32,
dpif_name(dpif), devname, port->port_no);
} else {
memset(port, 0, sizeof *port);
/* For ENODEV we use DBG level because the caller is probably
* interested in whether 'dpif' actually has a port 'devname', so that
* it's not an issue worth logging if it doesn't. Other errors are
* uncommon and more likely to indicate a real problem. */
VLOG_RL(&error_rl, error == ENODEV ? VLL_DBG : VLL_WARN,
"%s: failed to query port %s: %s",
dpif_name(dpif), devname, ovs_strerror(error));
}
return error;
}
/* Returns the Netlink PID value to supply in OVS_ACTION_ATTR_USERSPACE
* actions as the OVS_USERSPACE_ATTR_PID attribute's value, for use in
* flows whose packets arrived on port 'port_no'.
*
* A 'port_no' of ODPP_NONE is a special case: it returns a reserved PID, not
* allocated to any port, that the client may use for special purposes.
*
* The return value is only meaningful when DPIF_UC_ACTION has been enabled in
* the 'dpif''s listen mask. It is allowed to change when DPIF_UC_ACTION is
* disabled and then re-enabled, so a client that does that must be prepared to
* update all of the flows that it installed that contain
* OVS_ACTION_ATTR_USERSPACE actions. */
uint32_t
dpif_port_get_pid(const struct dpif *dpif, odp_port_t port_no)
{
return (dpif->dpif_class->port_get_pid
? (dpif->dpif_class->port_get_pid)(dpif, port_no)
: 0);
}
/* Looks up port number 'port_no' in 'dpif'. On success, returns 0 and copies
* the port's name into the 'name_size' bytes in 'name', ensuring that the
* result is null-terminated. On failure, returns a positive errno value and
* makes 'name' the empty string. */
int
dpif_port_get_name(struct dpif *dpif, odp_port_t port_no,
char *name, size_t name_size)
{
struct dpif_port port;
int error;
ovs_assert(name_size > 0);
error = dpif_port_query_by_number(dpif, port_no, &port, true);
if (!error) {
ovs_strlcpy(name, port.name, name_size);
dpif_port_destroy(&port);
} else {
*name = '\0';
}
return error;
}
/* Initializes 'dump' to begin dumping the ports in a dpif.
*
* This function provides no status indication. An error status for the entire
* dump operation is provided when it is completed by calling
* dpif_port_dump_done().
*/
void
dpif_port_dump_start(struct dpif_port_dump *dump, const struct dpif *dpif)
{
dump->dpif = dpif;
dump->error = dpif->dpif_class->port_dump_start(dpif, &dump->state);
log_operation(dpif, "port_dump_start", dump->error);
}
/* Attempts to retrieve another port from 'dump', which must have been
* initialized with dpif_port_dump_start(). On success, stores a new dpif_port
* into 'port' and returns true. On failure, returns false.
*
* Failure might indicate an actual error or merely that the last port has been
* dumped. An error status for the entire dump operation is provided when it
* is completed by calling dpif_port_dump_done().
*
* The dpif owns the data stored in 'port'. It will remain valid until at
* least the next time 'dump' is passed to dpif_port_dump_next() or
* dpif_port_dump_done(). */
bool
dpif_port_dump_next(struct dpif_port_dump *dump, struct dpif_port *port)
{
const struct dpif *dpif = dump->dpif;
if (dump->error) {
return false;
}
dump->error = dpif->dpif_class->port_dump_next(dpif, dump->state, port);
if (dump->error == EOF) {
VLOG_DBG_RL(&dpmsg_rl, "%s: dumped all ports", dpif_name(dpif));
} else {
log_operation(dpif, "port_dump_next", dump->error);
}
if (dump->error) {
dpif->dpif_class->port_dump_done(dpif, dump->state);
return false;
}
return true;
}
/* Completes port table dump operation 'dump', which must have been initialized
* with dpif_port_dump_start(). Returns 0 if the dump operation was
* error-free, otherwise a positive errno value describing the problem. */
int
dpif_port_dump_done(struct dpif_port_dump *dump)
{
const struct dpif *dpif = dump->dpif;
if (!dump->error) {
dump->error = dpif->dpif_class->port_dump_done(dpif, dump->state);
log_operation(dpif, "port_dump_done", dump->error);
}
return dump->error == EOF ? 0 : dump->error;
}
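/* Illustrative sketch (not compiled): dumping ports by hand with the
 * start/next/done calls above.  In-tree code normally uses the
 * DPIF_PORT_FOR_EACH macro, which wraps this same sequence. */
#if 0
static int
example_dump_ports(const struct dpif *dpif)
{
    struct dpif_port_dump dump;
    struct dpif_port port;

    dpif_port_dump_start(&dump, dpif);
    while (dpif_port_dump_next(&dump, &port)) {
        /* 'port' is owned by the dump; do not destroy it here. */
        VLOG_INFO("%"PRIu32": %s (%s)", port.port_no, port.name, port.type);
    }
    return dpif_port_dump_done(&dump);  /* 0 if the dump was error-free. */
}
#endif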
/* Polls for changes in the set of ports in 'dpif'. If the set of ports in
* 'dpif' has changed, this function does one of the following:
*
* - Stores the name of the device that was added to or deleted from 'dpif' in
* '*devnamep' and returns 0. The caller is responsible for freeing
* '*devnamep' (with free()) when it no longer needs it.
*
* - Returns ENOBUFS and sets '*devnamep' to NULL.
*
* This function may also return 'false positives', where it returns 0 and
* '*devnamep' names a device that was not actually added or deleted or it
* returns ENOBUFS without any change.
*
* Returns EAGAIN if the set of ports in 'dpif' has not changed. May also
* return other positive errno values to indicate that something has gone
* wrong. */
int
dpif_port_poll(const struct dpif *dpif, char **devnamep)
{
int error = dpif->dpif_class->port_poll(dpif, devnamep);
if (error) {
*devnamep = NULL;
}
return error;
}
/* Arranges for the poll loop to wake up when port_poll(dpif) will return a
* value other than EAGAIN. */
void
dpif_port_poll_wait(const struct dpif *dpif)
{
dpif->dpif_class->port_poll_wait(dpif);
}
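/* Illustrative sketch (not compiled): a main-loop fragment that reacts to
 * port changes.  On ENOBUFS a notification was dropped, so a caller would
 * typically resynchronize with a full port dump. */
#if 0
static void
example_watch_ports(const struct dpif *dpif)
{
    for (;;) {
        char *devname;
        int error = dpif_port_poll(dpif, &devname);

        if (!error) {
            VLOG_INFO("port %s added or removed", devname);
            free(devname);
        } else if (error == EAGAIN) {
            dpif_port_poll_wait(dpif);
            poll_block();
        } else {
            break;              /* ENOBUFS or a hard error. */
        }
    }
}
#endif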
/* Extracts the flow stats for a packet. The 'flow' and 'packet'
* arguments must have been initialized through a call to flow_extract().
* 'used' is stored into stats->used. */
void
dpif_flow_stats_extract(const struct flow *flow, const struct dp_packet *packet,
long long int used, struct dpif_flow_stats *stats)
{
stats->tcp_flags = ntohs(flow->tcp_flags);
stats->n_bytes = dp_packet_size(packet);
stats->n_packets = 1;
stats->used = used;
}
/* Appends a human-readable representation of 'stats' to 's'. */
void
dpif_flow_stats_format(const struct dpif_flow_stats *stats, struct ds *s)
{
ds_put_format(s, "packets:%"PRIu64", bytes:%"PRIu64", used:",
stats->n_packets, stats->n_bytes);
if (stats->used) {
ds_put_format(s, "%.3fs", (time_msec() - stats->used) / 1000.0);
} else {
ds_put_format(s, "never");
}
if (stats->tcp_flags) {
ds_put_cstr(s, ", flags:");
packet_format_tcp_flags(s, stats->tcp_flags);
}
}
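/* Illustrative sketch (not compiled): formatting a flow's stats, which
 * yields something like "packets:10, bytes:1280, used:0.500s". */
#if 0
static void
example_log_stats(const struct dpif_flow_stats *stats)
{
    struct ds s = DS_EMPTY_INITIALIZER;

    dpif_flow_stats_format(stats, &s);
    VLOG_DBG("flow stats: %s", ds_cstr(&s));
    ds_destroy(&s);
}
#endif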
/* Deletes all flows from 'dpif'. Returns 0 if successful, otherwise a
* positive errno value. */
int
dpif_flow_flush(struct dpif *dpif)
{
int error;
COVERAGE_INC(dpif_flow_flush);
error = dpif->dpif_class->flow_flush(dpif);
log_operation(dpif, "flow_flush", error);
return error;
}
/* Attempts to install 'key' into the datapath, fetches it, then deletes it.
* Returns true if the datapath supported installing 'flow', false otherwise.
*/
bool
dpif_probe_feature(struct dpif *dpif, const char *name,
const struct ofpbuf *key, const struct ofpbuf *actions,
const ovs_u128 *ufid)
{
struct dpif_flow flow;
struct ofpbuf reply;
uint64_t stub[DPIF_FLOW_BUFSIZE / 8];
bool enable_feature = false;
int error;
const struct nlattr *nl_actions = actions ? actions->data : NULL;
const size_t nl_actions_size = actions ? actions->size : 0;
/* Use DPIF_FP_MODIFY to cover the case where ovs-vswitchd is killed (and
* restarted) at just the right time such that feature probes from the
* previous run are still present in the datapath. */
error = dpif_flow_put(dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY | DPIF_FP_PROBE,
key->data, key->size, NULL, 0,
nl_actions, nl_actions_size,
ufid, NON_PMD_CORE_ID, NULL);
if (error) {
if (error != EINVAL && error != EOVERFLOW) {
VLOG_WARN("%s: %s flow probe failed (%s)",
dpif_name(dpif), name, ovs_strerror(error));
}
return false;
}
ofpbuf_use_stack(&reply, &stub, sizeof stub);
error = dpif_flow_get(dpif, key->data, key->size, ufid,
NON_PMD_CORE_ID, &reply, &flow);
if (!error
&& (!ufid || (flow.ufid_present
&& ovs_u128_equals(*ufid, flow.ufid)))) {
enable_feature = true;
}
error = dpif_flow_del(dpif, key->data, key->size, ufid,
NON_PMD_CORE_ID, NULL);
if (error) {
VLOG_WARN("%s: failed to delete %s feature probe flow",
dpif_name(dpif), name);
}
return enable_feature;
}
/* A dpif_operate() wrapper for performing a single DPIF_OP_FLOW_GET. */
int
dpif_flow_get(struct dpif *dpif,
const struct nlattr *key, size_t key_len, const ovs_u128 *ufid,
const unsigned pmd_id, struct ofpbuf *buf, struct dpif_flow *flow)
{
struct dpif_op *opp;
struct dpif_op op;
op.type = DPIF_OP_FLOW_GET;
op.flow_get.key = key;
op.flow_get.key_len = key_len;
op.flow_get.ufid = ufid;
op.flow_get.pmd_id = pmd_id;
op.flow_get.buffer = buf;
memset(flow, 0, sizeof *flow);
op.flow_get.flow = flow;
op.flow_get.flow->key = key;
op.flow_get.flow->key_len = key_len;
opp = &op;
dpif_operate(dpif, &opp, 1, DPIF_OFFLOAD_AUTO);
return op.error;
}
/* A dpif_operate() wrapper for performing a single DPIF_OP_FLOW_PUT. */
int
dpif_flow_put(struct dpif *dpif, enum dpif_flow_put_flags flags,
const struct nlattr *key, size_t key_len,
const struct nlattr *mask, size_t mask_len,
const struct nlattr *actions, size_t actions_len,
const ovs_u128 *ufid, const unsigned pmd_id,
struct dpif_flow_stats *stats)
{
struct dpif_op *opp;
struct dpif_op op;
op.type = DPIF_OP_FLOW_PUT;
op.flow_put.flags = flags;
op.flow_put.key = key;
op.flow_put.key_len = key_len;
op.flow_put.mask = mask;
op.flow_put.mask_len = mask_len;
op.flow_put.actions = actions;
op.flow_put.actions_len = actions_len;
op.flow_put.ufid = ufid;
op.flow_put.pmd_id = pmd_id;
op.flow_put.stats = stats;
opp = &op;
dpif_operate(dpif, &opp, 1, DPIF_OFFLOAD_AUTO);
return op.error;
}
/* A dpif_operate() wrapper for performing a single DPIF_OP_FLOW_DEL. */
int
dpif_flow_del(struct dpif *dpif,
const struct nlattr *key, size_t key_len, const ovs_u128 *ufid,
const unsigned pmd_id, struct dpif_flow_stats *stats)
{
struct dpif_op *opp;
struct dpif_op op;
op.type = DPIF_OP_FLOW_DEL;
op.flow_del.key = key;
op.flow_del.key_len = key_len;
op.flow_del.ufid = ufid;
op.flow_del.pmd_id = pmd_id;
op.flow_del.stats = stats;
op.flow_del.terse = false;
opp = &op;
dpif_operate(dpif, &opp, 1, DPIF_OFFLOAD_AUTO);
return op.error;
}
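/* Illustrative sketch (not compiled): the three wrappers above each send a
 * single operation, but a caller with many flows can batch them through
 * dpif_operate() directly.  Deleting by UFID alone, as here, assumes the
 * flows were installed with UFIDs. */
#if 0
static void
example_batch_delete(struct dpif *dpif, const ovs_u128 ufids[], size_t n)
{
    struct dpif_op ops[8];      /* Arbitrary example batch size. */
    struct dpif_op *opsp[8];
    size_t i;

    ovs_assert(n <= ARRAY_SIZE(ops));
    for (i = 0; i < n; i++) {
        memset(&ops[i], 0, sizeof ops[i]);
        ops[i].type = DPIF_OP_FLOW_DEL;
        ops[i].flow_del.ufid = &ufids[i];
        ops[i].flow_del.pmd_id = NON_PMD_CORE_ID;
        opsp[i] = &ops[i];
    }
    dpif_operate(dpif, opsp, n, DPIF_OFFLOAD_AUTO);
    /* Each ops[i].error now holds that operation's result. */
}
#endif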
/* Creates and returns a new 'struct dpif_flow_dump' for iterating through the
* flows in 'dpif'. If 'terse' is true, then only UFID and statistics will
* be returned in the dump. Otherwise, all fields will be returned.
*
* This function always successfully returns a dpif_flow_dump. Error
* reporting is deferred to dpif_flow_dump_destroy(). */
struct dpif_flow_dump *
dpif_flow_dump_create(const struct dpif *dpif, bool terse,
struct dpif_flow_dump_types *types)
{
return dpif->dpif_class->flow_dump_create(dpif, terse, types);
}
/* Destroys 'dump', which must have been created with dpif_flow_dump_create().
* All dpif_flow_dump_thread structures previously created for 'dump' must
* previously have been destroyed.
*
* Returns 0 if the dump operation was error-free, otherwise a positive errno
* value describing the problem. */
int
dpif_flow_dump_destroy(struct dpif_flow_dump *dump)
{
const struct dpif *dpif = dump->dpif;
int error = dpif->dpif_class->flow_dump_destroy(dump);
log_operation(dpif, "flow_dump_destroy", error);
return error == EOF ? 0 : error;
}
/* Returns new thread-local state for use with dpif_flow_dump_next(). */
struct dpif_flow_dump_thread *
dpif_flow_dump_thread_create(struct dpif_flow_dump *dump)
{
return dump->dpif->dpif_class->flow_dump_thread_create(dump);
}
/* Releases 'thread'. */
void
dpif_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread)
{
thread->dpif->dpif_class->flow_dump_thread_destroy(thread);
}
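/* Illustrative sketch (not compiled): a single-threaded flow dump using
 * the create/thread_create/next/destroy calls in this block.  Passing NULL
 * for 'types' is assumed here to place no restriction on which flows are
 * dumped. */
#if 0
static void
example_dump_flows(struct dpif *dpif)
{
    struct dpif_flow_dump *dump = dpif_flow_dump_create(dpif, false, NULL);
    struct dpif_flow_dump_thread *thread = dpif_flow_dump_thread_create(dump);
    struct dpif_flow flows[50];
    int i, n;

    while ((n = dpif_flow_dump_next(thread, flows, ARRAY_SIZE(flows))) > 0) {
        for (i = 0; i < n; i++) {
            /* flows[i] is owned by the datapath: read it, don't free it. */
            VLOG_DBG("flow with %"PRIuSIZE" bytes of actions",
                     flows[i].actions_len);
        }
    }
    dpif_flow_dump_thread_destroy(thread);
    if (dpif_flow_dump_destroy(dump)) {
        VLOG_WARN("flow dump failed");
    }
}
#endif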
/* Attempts to retrieve up to 'max_flows' more flows from 'thread'. Returns 0
* if and only if no flows remained to be retrieved, otherwise a positive
* number reflecting the number of elements in 'flows[]' that were updated.
* The number of flows returned might be less than 'max_flows' because
* fewer than 'max_flows' remained, because this particular datapath does not
* benefit from batching, or because an error occurred partway through
* retrieval. Thus, the caller should continue calling until a 0 return value,
* even if intermediate return values are less than 'max_flows'.
*
* No error status is immediately provided. An error status for the entire
* dump operation is provided when it is completed by calling
* dpif_flow_dump_destroy().
*
* All of the data stored into 'flows' is owned by the datapath, not by the
* caller, and the caller must not modify or free it. The datapath guarantees
* that it remains accessible and unchanged until the first of:
* - The next call to dpif_flow_dump_next() for 'thread', or
* - The next rcu quiescent period. */
int
dpif_flow_dump_next(struct dpif_flow_dump_thread *thread,
struct dpif_flow *flows, int max_flows)
{
struct dpif *dpif = thread->dpif;
int n;
ovs_assert(max_flows > 0);
n = dpif->dpif_class->flow_dump_next(thread, flows, max_flows);
if (n > 0) {
struct dpif_flow *f;
for (f = flows; f < &flows[n]
&& should_log_flow_message(&this_module, 0); f++) {
log_flow_message(dpif, 0, &this_module, "flow_dump",
f->key, f->key_len, f->mask, f->mask_len,
&f->ufid, &f->stats, f->actions, f->actions_len);
}
} else {
VLOG_DBG_RL(&dpmsg_rl, "%s: dumped all flows", dpif_name(dpif));
}
return n;
}
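/* A minimal single-threaded sketch of the iteration protocol described
 * above: call dpif_flow_dump_next() until it returns 0, then collect the
 * overall error from dpif_flow_dump_destroy().  The batch size of 50, the
 * non-terse dump, and the NULL dump types are arbitrary illustrative
 * choices, and this assumes the three-argument dpif_flow_dump_create()
 * variant. */
static int
dump_all_flows_sketch(struct dpif *dpif)
{
    struct dpif_flow_dump *dump = dpif_flow_dump_create(dpif, false, NULL);
    struct dpif_flow_dump_thread *thread = dpif_flow_dump_thread_create(dump);
    struct dpif_flow flows[50];
    int n;

    while ((n = dpif_flow_dump_next(thread, flows, ARRAY_SIZE(flows))) > 0) {
        for (int i = 0; i < n; i++) {
            /* Consume or copy flows[i] here; it is only guaranteed valid
             * until the next dpif_flow_dump_next() call or RCU quiescent
             * period. */
        }
    }
    dpif_flow_dump_thread_destroy(thread);
    return dpif_flow_dump_destroy(dump);   /* Error for the whole dump. */
}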
struct dpif_execute_helper_aux {
struct dpif *dpif;
const struct flow *flow;
int error;
const struct nlattr *meter_action; /* Non-NULL, if have a meter action. */
};
/* This is called for actions that need the context of the datapath to be
* meaningful. */
static void
dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_,
const struct nlattr *action, bool should_steal)
{
struct dpif_execute_helper_aux *aux = aux_;
int type = nl_attr_type(action);
struct dp_packet *packet = packets_->packets[0];
ovs_assert(dp_packet_batch_size(packets_) == 1);
switch ((enum ovs_action_attr)type) {
case OVS_ACTION_ATTR_METER:
/* Maintain a pointer to the first meter action seen. */
if (!aux->meter_action) {
aux->meter_action = action;
}
break;
case OVS_ACTION_ATTR_CT:
case OVS_ACTION_ATTR_OUTPUT:
case OVS_ACTION_ATTR_LB_OUTPUT:
case OVS_ACTION_ATTR_TUNNEL_PUSH:
case OVS_ACTION_ATTR_TUNNEL_POP:
case OVS_ACTION_ATTR_USERSPACE:
case OVS_ACTION_ATTR_PSAMPLE:
case OVS_ACTION_ATTR_SAMPLE:
case OVS_ACTION_ATTR_RECIRC: {
struct dpif_execute execute;
struct ofpbuf execute_actions;
uint64_t stub[256 / 8];
struct pkt_metadata *md = &packet->md;
if (flow_tnl_dst_is_set(&md->tunnel) || aux->meter_action) {
ofpbuf_use_stub(&execute_actions, stub, sizeof stub);
if (aux->meter_action) {
const struct nlattr *a = aux->meter_action;
            /* XXX: This code collects meter actions issued since the last
             * action execution via the datapath, to be executed right before
             * the current action that needs to be executed by the datapath.
             * This is only an approximation, but better than nothing.
             * Fundamentally, we should have a mechanism by which the
             * datapath could return the result of each meter action so that
             * we could execute them in the right order. */
do {
ofpbuf_put(&execute_actions, a, NLA_ALIGN(a->nla_len));
/* Find next meter action before 'action', if any. */
do {
a = nl_attr_next(a);
} while (a != action &&
nl_attr_type(a) != OVS_ACTION_ATTR_METER);
} while (a != action);
}
/* The Linux kernel datapath throws away the tunnel information
* that we supply as metadata. We have to use a "set" action to
* supply it. */
if (flow_tnl_dst_is_set(&md->tunnel)) {
odp_put_tunnel_action(&md->tunnel, &execute_actions, NULL);
}
ofpbuf_put(&execute_actions, action, NLA_ALIGN(action->nla_len));
execute.actions = execute_actions.data;
execute.actions_len = execute_actions.size;
} else {
execute.actions = action;
execute.actions_len = NLA_ALIGN(action->nla_len);
}
struct dp_packet *clone = NULL;
uint32_t cutlen = dp_packet_get_cutlen(packet);
if (cutlen && (type == OVS_ACTION_ATTR_OUTPUT
|| type == OVS_ACTION_ATTR_LB_OUTPUT
|| type == OVS_ACTION_ATTR_TUNNEL_PUSH
|| type == OVS_ACTION_ATTR_TUNNEL_POP
|| type == OVS_ACTION_ATTR_USERSPACE)) {
dp_packet_reset_cutlen(packet);
if (!should_steal) {
packet = clone = dp_packet_clone(packet);
}
dp_packet_set_size(packet, dp_packet_size(packet) - cutlen);
}
execute.packet = packet;
execute.flow = aux->flow;
execute.needs_help = false;
execute.probe = false;
execute.mtu = 0;
execute.hash = 0;
aux->error = dpif_execute(aux->dpif, &execute);
log_execute_message(aux->dpif, &this_module, &execute,
true, aux->error);
dp_packet_delete(clone);
if (flow_tnl_dst_is_set(&md->tunnel) || aux->meter_action) {
ofpbuf_uninit(&execute_actions);
/* Do not re-use the same meters for later output actions. */
aux->meter_action = NULL;
}
break;
}
case OVS_ACTION_ATTR_HASH:
case OVS_ACTION_ATTR_PUSH_VLAN:
case OVS_ACTION_ATTR_POP_VLAN:
case OVS_ACTION_ATTR_PUSH_MPLS:
case OVS_ACTION_ATTR_POP_MPLS:
case OVS_ACTION_ATTR_SET:
case OVS_ACTION_ATTR_SET_MASKED:
case OVS_ACTION_ATTR_TRUNC:
case OVS_ACTION_ATTR_PUSH_ETH:
case OVS_ACTION_ATTR_POP_ETH:
case OVS_ACTION_ATTR_CLONE:
case OVS_ACTION_ATTR_PUSH_NSH:
case OVS_ACTION_ATTR_POP_NSH:
case OVS_ACTION_ATTR_CT_CLEAR:
case OVS_ACTION_ATTR_UNSPEC:
case OVS_ACTION_ATTR_CHECK_PKT_LEN:
case OVS_ACTION_ATTR_DROP:
case OVS_ACTION_ATTR_ADD_MPLS:
case OVS_ACTION_ATTR_DEC_TTL:
case __OVS_ACTION_ATTR_MAX:
OVS_NOT_REACHED();
}
dp_packet_delete_batch(packets_, should_steal);
}
/* Executes 'execute' by performing most of the actions in userspace and
* passing the fully constructed packets to 'dpif' for output and userspace
* actions.
*
* This helps with actions that a given 'dpif' doesn't implement directly. */
static int
dpif_execute_with_help(struct dpif *dpif, struct dpif_execute *execute)
{
struct dpif_execute_helper_aux aux = {dpif, execute->flow, 0, NULL};
struct dp_packet_batch pb;
COVERAGE_INC(dpif_execute_with_help);
dp_packet_batch_init_packet(&pb, execute->packet);
odp_execute_actions(&aux, &pb, false, execute->actions,
execute->actions_len, dpif_execute_helper_cb);
return aux.error;
}
/* Returns true if the datapath needs help executing 'execute'. */
static bool
dpif_execute_needs_help(const struct dpif_execute *execute)
{
return execute->needs_help || nl_attr_oversized(execute->actions_len);
}
/* A dpif_operate() wrapper for performing a single DPIF_OP_EXECUTE. */
int
dpif_execute(struct dpif *dpif, struct dpif_execute *execute)
{
if (execute->actions_len) {
struct dpif_op *opp;
struct dpif_op op;
op.type = DPIF_OP_EXECUTE;
op.execute = *execute;
opp = &op;
dpif_operate(dpif, &opp, 1, DPIF_OFFLOAD_AUTO);
return op.error;
} else {
return 0;
}
}
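/* Sketch of packaging a single-packet execution for the wrapper above,
 * assuming the caller already owns 'packet', its extracted 'flow', and a
 * nlattr-formatted 'actions' buffer; the field assignments mirror those in
 * dpif_execute_helper_cb().  Leaving 'needs_help' false lets the
 * nl_attr_oversized() check shown earlier decide whether help is needed. */
static int
execute_packet_sketch(struct dpif *dpif, struct dp_packet *packet,
                      const struct flow *flow, const struct ofpbuf *actions)
{
    struct dpif_execute execute = {
        .actions = actions->data,
        .actions_len = actions->size,
        .packet = packet,
        .flow = flow,
        .needs_help = false,
        .probe = false,
        .mtu = 0,
        .hash = 0,
    };

    return dpif_execute(dpif, &execute);
}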
/* Executes each of the 'n_ops' operations in 'ops' on 'dpif', in the order in
* which they are specified. Places each operation's results in the "output"
* members documented in comments, and 0 in the 'error' member on success or a
* positive errno on failure.
*/
void
dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
enum dpif_offload_type offload_type)
{
if (offload_type == DPIF_OFFLOAD_ALWAYS && !netdev_is_flow_api_enabled()) {
size_t i;
for (i = 0; i < n_ops; i++) {
struct dpif_op *op = ops[i];
op->error = EINVAL;
}
return;
}
while (n_ops > 0) {
size_t chunk;
/* Count 'chunk', the number of ops that can be executed without
* needing any help. Ops that need help should be rare, so we
* expect this to ordinarily be 'n_ops', that is, all the ops. */
for (chunk = 0; chunk < n_ops; chunk++) {
struct dpif_op *op = ops[chunk];
if (op->type == DPIF_OP_EXECUTE
&& dpif_execute_needs_help(&op->execute)) {
break;
}
}
if (chunk) {
/* Execute a chunk full of ops that the dpif provider can
* handle itself, without help. */
size_t i;
dpif->dpif_class->operate(dpif, ops, chunk, offload_type);
for (i = 0; i < chunk; i++) {
struct dpif_op *op = ops[i];
int error = op->error;
switch (op->type) {
case DPIF_OP_FLOW_PUT: {
struct dpif_flow_put *put = &op->flow_put;
COVERAGE_INC(dpif_flow_put);
log_flow_put_message(dpif, &this_module, put, error);
if (error) {
COVERAGE_INC(dpif_flow_put_error);
if (put->stats) {
memset(put->stats, 0, sizeof *put->stats);
}
}
break;
}
case DPIF_OP_FLOW_GET: {
struct dpif_flow_get *get = &op->flow_get;
COVERAGE_INC(dpif_flow_get);
if (error) {
COVERAGE_INC(dpif_flow_get_error);
memset(get->flow, 0, sizeof *get->flow);
}
log_flow_get_message(dpif, &this_module, get, error);
break;
}
case DPIF_OP_FLOW_DEL: {
struct dpif_flow_del *del = &op->flow_del;
COVERAGE_INC(dpif_flow_del);
log_flow_del_message(dpif, &this_module, del, error);
if (error) {
COVERAGE_INC(dpif_flow_del_error);
if (del->stats) {
memset(del->stats, 0, sizeof *del->stats);
}
}
break;
}
case DPIF_OP_EXECUTE:
COVERAGE_INC(dpif_execute);
log_execute_message(dpif, &this_module, &op->execute,
false, error);
if (error) {
COVERAGE_INC(dpif_execute_error);
}
break;
}
}
ops += chunk;
n_ops -= chunk;
} else {
/* Help the dpif provider to execute one op. */
struct dpif_op *op = ops[0];
COVERAGE_INC(dpif_execute);
op->error = dpif_execute_with_help(dpif, &op->execute);
ops++;
n_ops--;
}
}
}
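/* Sketch of dispatching a caller-built batch through dpif_operate() and
 * reading back each op's 'error' member, per the contract above.
 * DPIF_OFFLOAD_AUTO mirrors the default that dpif_execute() passes. */
static size_t
operate_batch_sketch(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
{
    size_t n_failed = 0;

    dpif_operate(dpif, ops, n_ops, DPIF_OFFLOAD_AUTO);
    for (size_t i = 0; i < n_ops; i++) {
        if (ops[i]->error) {
            n_failed++;        /* 'error' is a positive errno value. */
        }
    }
    return n_failed;
}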
int
dpif_offload_stats_get(struct dpif *dpif, struct netdev_custom_stats *stats)
{
    return (dpif->dpif_class->offload_stats_get
            ? dpif->dpif_class->offload_stats_get(dpif, stats)
            : EOPNOTSUPP);
}
/* Returns a string that represents 'type', for use in log messages. */
const char *
dpif_upcall_type_to_string(enum dpif_upcall_type type)
{
switch (type) {
case DPIF_UC_MISS: return "miss";
case DPIF_UC_ACTION: return "action";
case DPIF_N_UC_TYPES: default: return "<unknown>";
}
}
/* Enables or disables receiving packets with dpif_recv() on 'dpif'. Returns 0
* if successful, otherwise a positive errno value.
*
* Turning packet receive off and then back on may change the Netlink PID
* assignments returned by dpif_port_get_pid(). If the client does this, it
* must update all of the flows that have OVS_ACTION_ATTR_USERSPACE actions
* using the new PID assignment. */
int
dpif_recv_set(struct dpif *dpif, bool enable)
{
int error = 0;
if (dpif->dpif_class->recv_set) {
error = dpif->dpif_class->recv_set(dpif, enable);
log_operation(dpif, "recv_set", error);
}
return error;
}
/* Refreshes the poll loops and Netlink sockets associated with each port,
 * when the number of upcall handlers (upcall receiving threads) is changed
 * to 'n_handlers' and receiving packets for 'dpif' is enabled by
 * recv_set().
*
* Since multiple upcall handlers can read upcalls simultaneously from
* 'dpif', each port can have multiple Netlink sockets, one per upcall
* handler. So, handlers_set() is responsible for the following tasks:
*
 * When receiving upcalls is enabled, extends or creates the
 * configuration to support:
*
* - 'n_handlers' Netlink sockets for each port.
*
* - 'n_handlers' poll loops, one for each upcall handler.
*
* - registering the Netlink sockets for the same upcall handler to
* the corresponding poll loop.
*
* Returns 0 if successful, otherwise a positive errno value. */
int
dpif_handlers_set(struct dpif *dpif, uint32_t n_handlers)
{
int error = 0;
if (dpif->dpif_class->handlers_set) {
error = dpif->dpif_class->handlers_set(dpif, n_handlers);
log_operation(dpif, "handlers_set", error);
}
return error;
}
/* Checks whether the datapath requires a specific number of handler threads.
 *
 * If so, returns 'true' and sets '*n_handlers' to that required number of
 * handler threads.
 *
 * Otherwise, returns 'false'. */
bool
dpif_number_handlers_required(struct dpif *dpif, uint32_t *n_handlers)
{
if (dpif->dpif_class->number_handlers_required) {
return dpif->dpif_class->number_handlers_required(dpif, n_handlers);
}
return false;
}
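/* Sketch of sizing the handler pool: let the datapath mandate a thread count
 * when it requires one, otherwise keep the caller's choice.  'n_wanted' is
 * an assumed configuration input, not an existing OVS knob. */
static int
configure_handlers_sketch(struct dpif *dpif, uint32_t n_wanted)
{
    uint32_t n_handlers = n_wanted;

    /* Overwrites 'n_handlers' only when a specific count is required. */
    dpif_number_handlers_required(dpif, &n_handlers);
    return dpif_handlers_set(dpif, n_handlers);
}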
void
dpif_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, void *aux)
{
if (dpif->dpif_class->register_dp_purge_cb) {
dpif->dpif_class->register_dp_purge_cb(dpif, cb, aux);
}
}
void
dpif_register_upcall_cb(struct dpif *dpif, upcall_callback *cb, void *aux)
{
if (dpif->dpif_class->register_upcall_cb) {
dpif->dpif_class->register_upcall_cb(dpif, cb, aux);
}
}
void
dpif_enable_upcall(struct dpif *dpif)
{
if (dpif->dpif_class->enable_upcall) {
dpif->dpif_class->enable_upcall(dpif);
}
}
void
dpif_disable_upcall(struct dpif *dpif)
{
if (dpif->dpif_class->disable_upcall) {
dpif->dpif_class->disable_upcall(dpif);
}
}
void
dpif_print_packet(struct dpif *dpif, struct dpif_upcall *upcall)
{
if (!VLOG_DROP_DBG(&dpmsg_rl)) {
struct ds flow;
char *packet;
packet = ofp_dp_packet_to_string(&upcall->packet);
ds_init(&flow);
odp_flow_key_format(upcall->key, upcall->key_len, &flow);
VLOG_DBG("%s: %s upcall:\n%s\n%s",
dpif_name(dpif), dpif_upcall_type_to_string(upcall->type),
ds_cstr(&flow), packet);
ds_destroy(&flow);
free(packet);
}
}
/* Passes custom configuration to the datapath implementation.  Some of the
 * changes can be postponed until dpif_run() is called. */
int
dpif_set_config(struct dpif *dpif, const struct smap *cfg)
{
int error = 0;
if (dpif->dpif_class->set_config) {
error = dpif->dpif_class->set_config(dpif, cfg);
if (error) {
log_operation(dpif, "set_config", error);
}
}
return error;
}
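/* Sketch of handing datapath-specific settings to dpif_set_config(); the
 * "example-knob" key is purely hypothetical and stands in for whatever
 * options the active dpif implementation understands. */
static int
set_config_sketch(struct dpif *dpif)
{
    struct smap cfg = SMAP_INITIALIZER(&cfg);
    int error;

    smap_add(&cfg, "example-knob", "true");   /* Hypothetical key. */
    error = dpif_set_config(dpif, &cfg);
    smap_destroy(&cfg);
    return error;
}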
/* Polls for an upcall from 'dpif' for an upcall handler.  Since there can
 * be multiple poll loops, 'handler_id' is needed as an index to identify the
 * corresponding poll loop.  If successful, stores the upcall into '*upcall',
* using 'buf' for storage. Should only be called if 'recv_set' has been used
* to enable receiving packets from 'dpif'.
*
* 'upcall->key' and 'upcall->userdata' point into data in the caller-provided
* 'buf', so their memory cannot be freed separately from 'buf'.
*
 * The caller owns the data of 'upcall->packet' and may modify it.  If the
 * packet's headroom is exhausted as it is manipulated, 'upcall->packet'
* will be reallocated. This requires the data of 'upcall->packet' to be
* released with ofpbuf_uninit() before 'upcall' is destroyed. However,
* when an error is returned, the 'upcall->packet' may be uninitialized
* and should not be released.
*
* Returns 0 if successful, otherwise a positive errno value. Returns EAGAIN
* if no upcall is immediately available. */
int
dpif_recv(struct dpif *dpif, uint32_t handler_id, struct dpif_upcall *upcall,
struct ofpbuf *buf)
{
int error = EAGAIN;
if (dpif->dpif_class->recv) {
error = dpif->dpif_class->recv(dpif, handler_id, upcall, buf);
if (!error) {
OVS_USDT_PROBE(dpif_recv, recv_upcall, dpif->full_name,
upcall->type,
dp_packet_data(&upcall->packet),
dp_packet_size(&upcall->packet),
upcall->key, upcall->key_len);
dpif_print_packet(dpif, upcall);
} else if (error != EAGAIN) {
log_operation(dpif, "recv", error);
}
}
return error;
}
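/* A minimal per-handler receive-loop sketch, assuming upcall receipt was
 * already enabled with dpif_recv_set(dpif, true); the 4 kB stub is an
 * arbitrary size. */
static void
handler_loop_sketch(struct dpif *dpif, uint32_t handler_id)
{
    for (;;) {
        struct dpif_upcall upcall;
        uint64_t stub[4096 / 8];
        struct ofpbuf buf;
        int error;

        ofpbuf_use_stub(&buf, stub, sizeof stub);
        error = dpif_recv(dpif, handler_id, &upcall, &buf);
        if (!error) {
            /* Consume 'upcall' here; its key points into 'buf'. */
            dp_packet_uninit(&upcall.packet);
            ofpbuf_uninit(&buf);
        } else if (error == EAGAIN) {
            ofpbuf_uninit(&buf);
            dpif_recv_wait(dpif, handler_id);
            poll_block();      /* Sleep until another upcall may arrive. */
        } else {
            ofpbuf_uninit(&buf);
            break;             /* Hard error; let the caller recover. */
        }
    }
}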
/* Discards all messages that would otherwise be received by dpif_recv() on
* 'dpif'. */
void
dpif_recv_purge(struct dpif *dpif)
{
COVERAGE_INC(dpif_purge);
if (dpif->dpif_class->recv_purge) {
dpif->dpif_class->recv_purge(dpif);
}
}
/* Arranges for the poll loop for an upcall handler to wake up when 'dpif'
 * has a message queued to be received with the recv member function.
 * Since there can be multiple poll loops, 'handler_id' is needed as an
 * index to identify the corresponding poll loop. */
void
dpif_recv_wait(struct dpif *dpif, uint32_t handler_id)
{
if (dpif->dpif_class->recv_wait) {
dpif->dpif_class->recv_wait(dpif, handler_id);
}
}
/* Returns the datapath version.  The caller is responsible for freeing
 * the string. */
char *
dpif_get_dp_version(const struct dpif *dpif)
{
char *version = NULL;
if (dpif->dpif_class->get_datapath_version) {
version = dpif->dpif_class->get_datapath_version();
}
return version;
}
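/* Sketch of the ownership rule above: when non-NULL, the returned string
 * must be freed by the caller. */
static void
log_dp_version_sketch(const struct dpif *dpif)
{
    char *version = dpif_get_dp_version(dpif);

    if (version) {
        VLOG_INFO("%s: datapath version %s", dpif_name(dpif), version);
        free(version);
    }
}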
/* Obtains the NetFlow engine type and engine ID for 'dpif' into '*engine_type'
* and '*engine_id', respectively. */
void
dpif_get_netflow_ids(const struct dpif *dpif,
uint8_t *engine_type, uint8_t *engine_id)
{
*engine_type = dpif->netflow_engine_type;
*engine_id = dpif->netflow_engine_id;
}
/* Translates OpenFlow queue ID 'queue_id' (in host byte order) into a priority
* value used for setting packet priority.
* On success, returns 0 and stores the priority into '*priority'.
* On failure, returns a positive errno value and stores 0 into '*priority'. */
int
dpif_queue_to_priority(const struct dpif *dpif, uint32_t queue_id,
uint32_t *priority)
{
int error = (dpif->dpif_class->queue_to_priority
? dpif->dpif_class->queue_to_priority(dpif, queue_id,
priority)
: EOPNOTSUPP);
if (error) {
*priority = 0;
}
log_operation(dpif, "queue_to_priority", error);
return error;
}
void
dpif_init(struct dpif *dpif, const struct dpif_class *dpif_class,
const char *name,
uint8_t netflow_engine_type, uint8_t netflow_engine_id)
{
dpif->dpif_class = dpif_class;
dpif->base_name = xstrdup(name);
dpif->full_name = xasprintf("%s@%s", dpif_class->type, name);
dpif->netflow_engine_type = netflow_engine_type;
dpif->netflow_engine_id = netflow_engine_id;
}
/* Undoes the results of initialization.
*
* Normally this function only needs to be called from dpif_close().
* However, it may be called by providers due to an error on opening
 * that occurs after initialization.  In this case dpif_close() would
* never be called. */
void
dpif_uninit(struct dpif *dpif, bool close)
{
char *base_name = dpif->base_name;
char *full_name = dpif->full_name;
if (close) {
dpif->dpif_class->close(dpif);
}
free(base_name);
free(full_name);
}
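
/* Illustrative sketch (hypothetical provider code; 'set_up_datapath',
 * 'engine_type', and 'engine_id' are placeholders): on an error after
 * dpif_init() but before the dpif is handed back to the caller, the
 * provider cleans up with dpif_uninit() itself, passing close=false so
 * that ->close() does not run on a half-opened dpif.
 *
 *     dpif_init(&dpif->dpif, dpif_class, name, engine_type, engine_id);
 *     error = set_up_datapath(dpif);
 *     if (error) {
 *         dpif_uninit(&dpif->dpif, false);
 *         free(dpif);
 *         return error;
 *     }
 */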
static void
log_operation(const struct dpif *dpif, const char *operation, int error)
{
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: %s success", dpif_name(dpif), operation);
} else if (ofperr_is_valid(error)) {
VLOG_WARN_RL(&error_rl, "%s: %s failed (%s)",
dpif_name(dpif), operation, ofperr_get_name(error));
} else {
VLOG_WARN_RL(&error_rl, "%s: %s failed (%s)",
dpif_name(dpif), operation, ovs_strerror(error));
}
}
static enum vlog_level
flow_message_log_level(int error)
{
/* If flows arrive in a batch, userspace may push down multiple
* unique flow definitions that overlap when wildcards are applied.
* Kernels that support flow wildcarding will reject these flows as
* duplicates (EEXIST), so lower the log level to debug for these
* types of messages. */
return (error && error != EEXIST) ? VLL_WARN : VLL_DBG;
}
static bool
should_log_flow_message(const struct vlog_module *module, int error)
{
return !vlog_should_drop(module, flow_message_log_level(error),
error ? &error_rl : &dpmsg_rl);
}
void
log_flow_message(const struct dpif *dpif, int error,
const struct vlog_module *module,
const char *operation,
const struct nlattr *key, size_t key_len,
const struct nlattr *mask, size_t mask_len,
const ovs_u128 *ufid, const struct dpif_flow_stats *stats,
const struct nlattr *actions, size_t actions_len)
{
struct ds ds = DS_EMPTY_INITIALIZER;
ds_put_format(&ds, "%s: ", dpif_name(dpif));
if (error) {
ds_put_cstr(&ds, "failed to ");
}
ds_put_format(&ds, "%s ", operation);
if (error) {
ds_put_format(&ds, "(%s) ", ovs_strerror(error));
}
if (ufid) {
odp_format_ufid(ufid, &ds);
ds_put_cstr(&ds, " ");
}
odp_flow_format(key, key_len, mask, mask_len, NULL, &ds, true);
if (stats) {
ds_put_cstr(&ds, ", ");
dpif_flow_stats_format(stats, &ds);
}
if (actions || actions_len) {
ds_put_cstr(&ds, ", actions:");
format_odp_actions(&ds, actions, actions_len, NULL);
}
vlog(module, flow_message_log_level(error), "%s", ds_cstr(&ds));
ds_destroy(&ds);
}
void
log_flow_put_message(const struct dpif *dpif,
const struct vlog_module *module,
const struct dpif_flow_put *put,
int error)
{
if (should_log_flow_message(module, error)
&& !(put->flags & DPIF_FP_PROBE)) {
struct ds s;
ds_init(&s);
ds_put_cstr(&s, "put");
if (put->flags & DPIF_FP_CREATE) {
ds_put_cstr(&s, "[create]");
}
if (put->flags & DPIF_FP_MODIFY) {
ds_put_cstr(&s, "[modify]");
}
if (put->flags & DPIF_FP_ZERO_STATS) {
ds_put_cstr(&s, "[zero]");
}
log_flow_message(dpif, error, module, ds_cstr(&s),
put->key, put->key_len, put->mask, put->mask_len,
put->ufid, put->stats, put->actions,
put->actions_len);
ds_destroy(&s);
}
}
void
log_flow_del_message(const struct dpif *dpif,
const struct vlog_module *module,
const struct dpif_flow_del *del,
int error)
{
if (should_log_flow_message(module, error)) {
log_flow_message(dpif, error, module, "flow_del",
del->key, del->key_len,
NULL, 0, del->ufid, !error ? del->stats : NULL,
NULL, 0);
}
}
/* Logs that 'execute' was executed on 'dpif' and completed with errno 'error'
* (0 for success). 'subexecute' should be true if the execution is a result
* of breaking down a larger execution that needed help, false otherwise.
 *
* XXX In theory, the log message could be deceptive because this function is
* called after the dpif_provider's '->execute' function, which is allowed to
* modify execute->packet and execute->md. In practice, though:
*
* - dpif-netlink doesn't modify execute->packet or execute->md.
*
* - dpif-netdev does modify them but it is less likely to have problems
* because it is built into ovs-vswitchd and cannot have version skew,
* etc.
*
* It would still be better to avoid the potential problem. I don't know of a
* good way to do that, though, that isn't expensive. */
void
log_execute_message(const struct dpif *dpif,
const struct vlog_module *module,
const struct dpif_execute *execute,
bool subexecute, int error)
{
if (!(error ? VLOG_DROP_WARN(&error_rl) : VLOG_DROP_DBG(&dpmsg_rl))
&& !execute->probe) {
struct ds ds = DS_EMPTY_INITIALIZER;
char *packet;
uint64_t stub[1024 / 8];
struct ofpbuf md = OFPBUF_STUB_INITIALIZER(stub);
packet = ofp_packet_to_string(dp_packet_data(execute->packet),
dp_packet_size(execute->packet),
execute->packet->packet_type);
odp_key_from_dp_packet(&md, execute->packet);
ds_put_format(&ds, "%s: %sexecute ",
dpif_name(dpif),
(subexecute ? "sub-"
: dpif_execute_needs_help(execute) ? "super-"
: ""));
format_odp_actions(&ds, execute->actions, execute->actions_len, NULL);
if (error) {
ds_put_format(&ds, " failed (%s)", ovs_strerror(error));
}
ds_put_format(&ds, " on packet %s", packet);
ds_put_format(&ds, " with metadata ");
odp_flow_format(md.data, md.size, NULL, 0, NULL, &ds, true);
ds_put_format(&ds, " mtu %d", execute->mtu);
vlog(module, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds));
ds_destroy(&ds);
free(packet);
ofpbuf_uninit(&md);
}
}
void
log_flow_get_message(const struct dpif *dpif,
const struct vlog_module *module,
const struct dpif_flow_get *get,
int error)
{
if (should_log_flow_message(module, error)) {
log_flow_message(dpif, error, module, "flow_get",
get->key, get->key_len,
get->flow->mask, get->flow->mask_len,
get->ufid, &get->flow->stats,
get->flow->actions, get->flow->actions_len);
}
}
bool
dpif_supports_tnl_push_pop(const struct dpif *dpif)
{
return dpif_is_netdev(dpif);
}
bool
dpif_may_support_explicit_drop_action(const struct dpif *dpif)
{
/* TC does not support offloading this action. */
return dpif_is_netdev(dpif) || !netdev_is_flow_api_enabled();
}
bool
dpif_supports_lb_output_action(const struct dpif *dpif)
{
    /*
     * The balance-tcp bond optimization is currently supported by the
     * netdev datapath only.
     */
return dpif_is_netdev(dpif);
}
/* Meters */
void
dpif_meter_get_features(const struct dpif *dpif,
struct ofputil_meter_features *features)
{
memset(features, 0, sizeof *features);
if (dpif->dpif_class->meter_get_features) {
dpif->dpif_class->meter_get_features(dpif, features);
}
}
/* Adds or modifies the meter in 'dpif' with the given 'meter_id' and
* the configuration in 'config'.
*
* The meter id specified through 'config->meter_id' is ignored. */
int
dpif_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
struct ofputil_meter_config *config)
{
COVERAGE_INC(dpif_meter_set);
if (!(config->flags & (OFPMF13_KBPS | OFPMF13_PKTPS))) {
return EBADF; /* Rate unit type not set. */
}
if ((config->flags & OFPMF13_KBPS) && (config->flags & OFPMF13_PKTPS)) {
return EBADF; /* Both rate units may not be set. */
}
if (config->n_bands == 0) {
return EINVAL;
}
for (size_t i = 0; i < config->n_bands; i++) {
if (config->bands[i].rate == 0) {
            return EDOM; /* Rate must be non-zero. */
}
}
int error = dpif->dpif_class->meter_set(dpif, meter_id, config);
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: DPIF meter %"PRIu32" set",
dpif_name(dpif), meter_id.uint32);
} else {
VLOG_WARN_RL(&error_rl, "%s: failed to set DPIF meter %"PRIu32": %s",
dpif_name(dpif), meter_id.uint32, ovs_strerror(error));
}
return error;
}
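
/* Illustrative sketch (assuming meter id 1 is available in 'dpif'): a
 * minimal configuration that satisfies the checks in dpif_meter_set() by
 * setting exactly one rate unit flag, one band, and a nonzero rate.
 *
 *     struct ofputil_meter_band band = {
 *         .type = OFPMBT13_DROP,
 *         .rate = 1000,
 *     };
 *     struct ofputil_meter_config config = {
 *         .flags = OFPMF13_KBPS,
 *         .n_bands = 1,
 *         .bands = &band,
 *     };
 *     ofproto_meter_id meter_id = { .uint32 = 1 };
 *     int error = dpif_meter_set(dpif, meter_id, &config);
 */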
int
dpif_meter_get(const struct dpif *dpif, ofproto_meter_id meter_id,
struct ofputil_meter_stats *stats, uint16_t n_bands)
{
int error;
COVERAGE_INC(dpif_meter_get);
error = dpif->dpif_class->meter_get(dpif, meter_id, stats, n_bands);
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: DPIF meter %"PRIu32" get stats",
dpif_name(dpif), meter_id.uint32);
} else {
VLOG_WARN_RL(&error_rl,
"%s: failed to get DPIF meter %"PRIu32" stats: %s",
dpif_name(dpif), meter_id.uint32, ovs_strerror(error));
stats->packet_in_count = ~0;
stats->byte_in_count = ~0;
stats->n_bands = 0;
}
return error;
}
int
dpif_meter_del(struct dpif *dpif, ofproto_meter_id meter_id,
struct ofputil_meter_stats *stats, uint16_t n_bands)
{
int error;
COVERAGE_INC(dpif_meter_del);
error = dpif->dpif_class->meter_del(dpif, meter_id, stats, n_bands);
if (!error) {
VLOG_DBG_RL(&dpmsg_rl, "%s: DPIF meter %"PRIu32" deleted",
dpif_name(dpif), meter_id.uint32);
} else {
VLOG_WARN_RL(&error_rl,
"%s: failed to delete DPIF meter %"PRIu32": %s",
dpif_name(dpif), meter_id.uint32, ovs_strerror(error));
if (stats) {
stats->packet_in_count = ~0;
stats->byte_in_count = ~0;
stats->n_bands = 0;
}
}
return error;
}
int
dpif_bond_add(struct dpif *dpif, uint32_t bond_id, odp_port_t *member_map)
{
return dpif->dpif_class->bond_add
? dpif->dpif_class->bond_add(dpif, bond_id, member_map)
: EOPNOTSUPP;
}
int
dpif_bond_del(struct dpif *dpif, uint32_t bond_id)
{
return dpif->dpif_class->bond_del
? dpif->dpif_class->bond_del(dpif, bond_id)
: EOPNOTSUPP;
}
int
dpif_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
uint64_t *n_bytes)
{
memset(n_bytes, 0, BOND_BUCKETS * sizeof *n_bytes);
return dpif->dpif_class->bond_stats_get
? dpif->dpif_class->bond_stats_get(dpif, bond_id, n_bytes)
: EOPNOTSUPP;
}
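
/* Illustrative sketch: 'n_bytes' must point to an array of BOND_BUCKETS
 * counters, which dpif_bond_stats_get() zeroes before the provider fills
 * in the per-bucket byte counts.
 *
 *     uint64_t n_bytes[BOND_BUCKETS];
 *     if (!dpif_bond_stats_get(dpif, bond_id, n_bytes)) {
 *         ...aggregate or report the per-bucket byte counts...
 *     }
 */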
int
dpif_get_n_offloaded_flows(struct dpif *dpif, uint64_t *n_flows)
{
const char *dpif_type_str = dpif_normalize_type(dpif_type(dpif));
struct dpif_port_dump port_dump;
struct dpif_port dpif_port;
int ret, n_devs = 0;
uint64_t nflows;
*n_flows = 0;
DPIF_PORT_FOR_EACH (&dpif_port, &port_dump, dpif) {
ret = netdev_ports_get_n_flows(dpif_type_str, dpif_port.port_no,
&nflows);
if (!ret) {
*n_flows += nflows;
} else if (ret == EOPNOTSUPP) {
continue;
}
n_devs++;
}
return n_devs ? 0 : EOPNOTSUPP;
}
int
dpif_cache_get_supported_levels(struct dpif *dpif, uint32_t *levels)
{
return dpif->dpif_class->cache_get_supported_levels
? dpif->dpif_class->cache_get_supported_levels(dpif, levels)
: EOPNOTSUPP;
}
int
dpif_cache_get_name(struct dpif *dpif, uint32_t level, const char **name)
{
return dpif->dpif_class->cache_get_name
? dpif->dpif_class->cache_get_name(dpif, level, name)
: EOPNOTSUPP;
}
int
dpif_cache_get_size(struct dpif *dpif, uint32_t level, uint32_t *size)
{
return dpif->dpif_class->cache_get_size
? dpif->dpif_class->cache_get_size(dpif, level, size)
: EOPNOTSUPP;
}
int
dpif_cache_set_size(struct dpif *dpif, uint32_t level, uint32_t size)
{
return dpif->dpif_class->cache_set_size
? dpif->dpif_class->cache_set_size(dpif, level, size)
: EOPNOTSUPP;
}
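
/* Illustrative sketch: the cache query functions are level-indexed, so a
 * caller typically obtains the number of supported levels first and then
 * iterates over them.
 *
 *     uint32_t levels;
 *     if (!dpif_cache_get_supported_levels(dpif, &levels)) {
 *         for (uint32_t i = 0; i < levels; i++) {
 *             const char *name;
 *             uint32_t size;
 *             if (!dpif_cache_get_name(dpif, i, &name)
 *                 && !dpif_cache_get_size(dpif, i, &size)) {
 *                 VLOG_INFO("cache %s holds %"PRIu32" entries", name, size);
 *             }
 *         }
 *     }
 */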
bool
dpif_synced_dp_layers(struct dpif *dpif)
{
return dpif->dpif_class->synced_dp_layers;
}