2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 09:58:01 +00:00
ovs/ofproto/ofproto-dpif-xlate.c

8559 lines
297 KiB
C
Raw Normal View History

/* Copyright (c) 2009-2017, 2019-2020 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include <config.h>
#include "ofproto/ofproto-dpif-xlate.h"
#include <errno.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/socket.h>
#include "bfd.h"
#include "bitmap.h"
#include "bond.h"
#include "bundle.h"
#include "byte-order.h"
#include "cfm.h"
#include "connmgr.h"
#include "coverage.h"
#include "csum.h"
#include "dp-packet.h"
#include "dpif.h"
#include "in-band.h"
#include "lacp.h"
#include "learn.h"
#include "mac-learning.h"
#include "mcast-snooping.h"
#include "multipath.h"
#include "netdev-vport.h"
#include "netlink.h"
#include "nx-match.h"
#include "odp-execute.h"
#include "ofproto/ofproto-dpif-ipfix.h"
#include "ofproto/ofproto-dpif-mirror.h"
#include "ofproto/ofproto-dpif-monitor.h"
#include "ofproto/ofproto-dpif-sflow.h"
#include "ofproto/ofproto-dpif-trace.h"
#include "ofproto/ofproto-dpif-xlate-cache.h"
#include "ofproto/ofproto-dpif.h"
#include "ofproto/ofproto-provider.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/meta-flow.h"
#include "openvswitch/list.h"
#include "openvswitch/ofp-actions.h"
#include "openvswitch/ofp-ed-props.h"
#include "openvswitch/vlog.h"
#include "ovs-lldp.h"
#include "ovs-router.h"
#include "packets.h"
#include "tnl-neigh-cache.h"
#include "tnl-ports.h"
#include "tunnel.h"
#include "util.h"
#include "uuid.h"
ofproto-dpif-xlate: Fix use-after-free when xlate_actions(). Currently, bundle->cvlans and xbundle->cvlans are pointing to the same memory location. This can cause issues if the main thread modifies bundle->cvlans and frees it while the revalidator thread is still accessing xbundle->cvlans. This leads to use-after-free error. AddressSanitizer: heap-use-after-free on address 0x615000007b08 at pc 0x0000004ede1e bp 0x7f3120ee0310 sp 0x7f3120ee0300 READ of size 8 at 0x615000007b08 thread T25 (revalidator25) 0 0x4ede1d in bitmap_is_set lib/bitmap.h:91 1 0x4fcb26 in xbundle_allows_cvlan ofproto/ofproto-dpif-xlate.c:2028 2 0x4fe279 in input_vid_is_valid ofproto/ofproto-dpif-xlate.c:2294 3 0x502abf in xlate_normal ofproto/ofproto-dpif-xlate.c:3051 4 0x5164dc in xlate_output_action ofproto/ofproto-dpif-xlate.c:5361 5 0x522576 in do_xlate_actions ofproto/ofproto-dpif-xlate.c:7047 6 0x52a751 in xlate_actions ofproto/ofproto-dpif-xlate.c:8061 7 0x4e2b66 in xlate_key ofproto/ofproto-dpif-upcall.c:2212 8 0x4e2e13 in xlate_ukey ofproto/ofproto-dpif-upcall.c:2227 9 0x4e345d in revalidate_ukey__ ofproto/ofproto-dpif-upcall.c:2276 10 0x4e3f85 in revalidate_ukey ofproto/ofproto-dpif-upcall.c:2395 11 0x4e7ac5 in revalidate ofproto/ofproto-dpif-upcall.c:2858 12 0x4d9ed3 in udpif_revalidator ofproto/ofproto-dpif-upcall.c:1010 13 0x7cd92e in ovsthread_wrapper lib/ovs-thread.c:423 14 0x7f312ff01f3a (/usr/lib64/libpthread.so.0+0x8f3a) 15 0x7f312fc8f51f in clone (/usr/lib64/libc.so.6+0xf851f) 0x615000007b08 is located 8 bytes inside of 512-byte region [0x615000007b00,0x615000007d00) freed by thread T0 here: 0 0x7f3130378ad8 in free (/usr/lib64/libasan.so.4+0xe0ad8) 1 0x49044e in bundle_set ofproto/ofproto-dpif.c:3431 2 0x444f92 in ofproto_bundle_register ofproto/ofproto.c:1455 3 0x40e6c9 in port_configure vswitchd/bridge.c:1300 4 0x40bcfd in bridge_reconfigure vswitchd/bridge.c:921 5 0x41f1a9 in bridge_run vswitchd/bridge.c:3313 6 0x42d4fb in main vswitchd/ovs-vswitchd.c:132 7 
0x7f312fbbcc86 in __libc_start_main (/usr/lib64/libc.so.6+0x25c86) previously allocated by thread T0 here: 0 0x7f3130378e70 in __interceptor_malloc 1 0x8757fe in xmalloc__ lib/util.c:140 2 0x8758da in xmalloc lib/util.c:175 3 0x875927 in xmemdup lib/util.c:188 4 0x475f63 in bitmap_clone lib/bitmap.h:79 5 0x47797c in vlan_bitmap_clone lib/vlan-bitmap.h:40 6 0x49048d in bundle_set ofproto/ofproto-dpif.c:3433 7 0x444f92 in ofproto_bundle_register ofproto/ofproto.c:1455 8 0x40e6c9 in port_configure vswitchd/bridge.c:1300 9 0x40bcfd in bridge_reconfigure vswitchd/bridge.c:921 10 0x41f1a9 in bridge_run vswitchd/bridge.c:3313 11 0x42d4fb in main vswitchd/ovs-vswitchd.c:132 12 0x7f312fbbcc86 in __libc_start_main (/usr/lib64/libc.so.6+0x25c86) Fixes: fed8962aff57 ("Add new port VLAN mode "dot1q-tunnel"") Signed-off-by: Yunjian Wang <wangyunjian@huawei.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-05-06 18:00:09 +08:00
#include "vlan-bitmap.h"
/* Coverage counters bumped during flow translation (see uses below in this
 * file). */
COVERAGE_DEFINE(xlate_actions);
COVERAGE_DEFINE(xlate_actions_oversize);
COVERAGE_DEFINE(xlate_actions_too_many_output);

VLOG_DEFINE_THIS_MODULE(ofproto_dpif_xlate);

/* Maximum depth of flow table recursion (due to resubmit actions) in a
 * flow translation.
 *
 * The goal of limiting the depth of resubmits is to ensure that flow
 * translation eventually terminates.  Only resubmits to the same table or an
 * earlier table count against the maximum depth.  This is because resubmits to
 * strictly monotonically increasing table IDs will eventually terminate, since
 * any OpenFlow switch has a finite number of tables.  OpenFlow tables are most
 * commonly traversed in numerically increasing order, so this limit has little
 * effect on conventionally designed OpenFlow pipelines.
 *
 * Outputs to patch ports and to groups also count against the depth limit. */
#define MAX_DEPTH 64

/* Maximum number of resubmit actions in a flow translation, whether they are
 * recursive or not. */
#define MAX_RESUBMITS (MAX_DEPTH * MAX_DEPTH)
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retrieved data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific function terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
/* The structure holds an array of IP addresses assigned to a bridge and the
* number of elements in the array. These data are mutable and are evaluated
* when ARP or Neighbor Advertisement packets received on a native tunnel
* port are xlated. So 'ref_cnt' and RCU are used for synchronization. */
struct xbridge_addr {
    struct in6_addr *addr;        /* Array of IP addresses of xbridge. */
    int n_addr;                   /* Number of IP addresses. */
    struct ovs_refcount ref_cnt;  /* Refcount; see the synchronization note
                                   * in the comment above this struct. */
};
/* Translation-side view of a bridge ("struct ofproto_dpif"), keyed on
 * 'ofproto' in the global 'xbridges' map.
 *
 * NOTE(review): a scraped git commit message had been spliced into the middle
 * of this struct; it has been removed to restore a compilable definition. */
struct xbridge {
    struct hmap_node hmap_node;   /* Node in global 'xbridges' map. */
    struct ofproto_dpif *ofproto; /* Key in global 'xbridges' map. */

    struct ovs_list xbundles;     /* Owned xbundles. */
    struct hmap xports;           /* Indexed by ofp_port. */

    char *name;                   /* Name used in log messages. */
    struct dpif *dpif;            /* Datapath interface. */
    struct mac_learning *ml;      /* Mac learning handle. */
    struct mcast_snooping *ms;    /* Multicast Snooping handle. */
    struct mbridge *mbridge;      /* Mirroring. */
    struct dpif_sflow *sflow;     /* SFlow handle, or null. */
    struct dpif_ipfix *ipfix;     /* Ipfix handle, or null. */
    struct netflow *netflow;      /* Netflow handle, or null. */
    struct stp *stp;              /* STP or null if disabled. */
    struct rstp *rstp;            /* RSTP or null if disabled. */

    bool has_in_band;             /* Bridge has in band control? */
    bool forward_bpdu;            /* Bridge forwards STP BPDUs? */

    /* Datapath feature support. */
    struct dpif_backer_support support;

    struct xbridge_addr *addr;    /* IP addresses of the bridge, used when
                                   * xlating ARP/ND on native tunnel ports;
                                   * see 'struct xbridge_addr' above. */
};
/* Translation-side view of a port bundle, keyed on 'ofbundle' in the global
 * 'xbundles' map.  Holds the VLAN configuration consulted during flow
 * translation. */
struct xbundle {
    struct hmap_node hmap_node;    /* In global 'xbundles' map. */
    struct ofbundle *ofbundle;     /* Key in global 'xbundles' map. */

    struct ovs_list list_node;     /* In parent 'xbridges' list. */
    struct xbridge *xbridge;       /* Parent xbridge. */

    struct ovs_list xports;        /* Contains "struct xport"s. */

    char *name;                    /* Name used in log messages. */
    struct bond *bond;             /* Nonnull iff more than one port. */
    struct lacp *lacp;             /* LACP handle or null. */

    enum port_vlan_mode vlan_mode; /* VLAN mode. */
    uint16_t qinq_ethtype;         /* Ethertype of dot1q-tunnel interface
                                    * either 0x8100 or 0x88a8. */
    int vlan;                      /* -1=trunk port, else a 12-bit VLAN ID. */
    unsigned long *trunks;         /* Bitmap of trunked VLANs, if 'vlan' == -1.
                                    * NULL if all VLANs are trunked. */
    unsigned long *cvlans;         /* Bitmap of allowed customer vlans,
                                    * NULL if all VLANs are allowed */
    enum port_priority_tags_mode use_priority_tags;
                                   /* Use 802.1p tag for frames in VLAN 0? */
    bool floodable;                /* No port has OFPUTIL_PC_NO_FLOOD set? */
    bool protected;                /* Protected port mode */
};
/* Translation-side view of an OpenFlow port, keyed on 'ofport' in the global
 * 'xports' map (and on 'uuid' in the global 'xports_uuid' map). */
struct xport {
    struct hmap_node hmap_node;      /* Node in global 'xports' map. */
    struct ofport_dpif *ofport;      /* Key in global 'xports map. */

    struct hmap_node ofp_node;       /* Node in parent xbridge 'xports' map. */
    ofp_port_t ofp_port;             /* Key in parent xbridge 'xports' map. */

    struct hmap_node uuid_node;      /* Node in global 'xports_uuid' map. */
    struct uuid uuid;                /* Key in global 'xports_uuid' map. */

    odp_port_t odp_port;             /* Datapath port number or ODPP_NONE. */

    struct ovs_list bundle_node;     /* In parent xbundle (if it exists). */
    struct xbundle *xbundle;         /* Parent xbundle or null. */

    struct netdev *netdev;           /* 'ofport''s netdev. */

    struct xbridge *xbridge;         /* Parent bridge. */
    struct xport *peer;              /* Patch port peer or null. */

    enum ofputil_port_config config; /* OpenFlow port configuration. */
    enum ofputil_port_state state;   /* OpenFlow port state. */
    int stp_port_no;                 /* STP port number or -1 if not in use. */
    struct rstp_port *rstp_port;     /* RSTP port or null. */

    struct hmap skb_priorities;      /* Map of 'skb_priority_to_dscp's. */

    bool may_enable;                 /* May be enabled in bonds. */
    bool is_tunnel;                  /* Is a tunnel port. */
    enum netdev_pt_mode pt_mode;     /* packet_type handling. */

    struct cfm *cfm;                 /* CFM handle or null. */
    struct bfd *bfd;                 /* BFD handle or null. */
    struct lldp *lldp;               /* LLDP handle or null. */
};
struct xlate_ctx {
struct xlate_in *xin;
struct xlate_out *xout;
struct xlate_cfg *xcfg;
const struct xbridge *xbridge;
/* Flow at the last commit. */
struct flow base_flow;
/* Tunnel IP destination address as received. This is stored separately
* as the base_flow.tunnel is cleared on init to reflect the datapath
* behavior. Used to make sure not to send tunneled output to ourselves,
* which might lead to an infinite loop. This could happen easily
* if a tunnel is marked as 'ip_remote=flow', and the flow does not
* actually set the tun_dst field. */
struct in6_addr orig_tunnel_ipv6_dst;
/* Stack for the push and pop actions. See comment above nx_stack_push()
* in nx-match.c for info on how the stack is stored. */
struct ofpbuf stack;
/* The rule that we are currently translating, or NULL. */
struct rule_dpif *rule;
/* Flow translation populates this with wildcards relevant in translation.
* When 'xin->wc' is nonnull, this is the same pointer. When 'xin->wc' is
* null, this is a pointer to a temporary buffer. */
struct flow_wildcards *wc;
/* Output buffer for datapath actions. When 'xin->odp_actions' is nonnull,
* this is the same pointer. When 'xin->odp_actions' is null, this points
* to a scratch ofpbuf. This allows code to add actions to
* 'ctx->odp_actions' without worrying about whether the caller really
* wants actions. */
struct ofpbuf *odp_actions;
/* Statistics maintained by xlate_table_action().
*
* These statistics limit the amount of work that a single flow
* translation can perform. The goal of the first of these, 'depth', is
* primarily to prevent translation from performing an infinite amount of
* work. It counts the current depth of nested "resubmit"s (and a few
* other activities); when a resubmit returns, it decreases. Resubmits to
* tables in strictly monotonically increasing order don't contribute to
* 'depth' because they cannot cause a flow translation to take an infinite
* amount of time (because the number of tables is finite). Translation
* aborts when 'depth' exceeds MAX_DEPTH.
*
* 'resubmits', on the other hand, prevents flow translation from
* performing an extraordinarily large while still finite amount of work.
* It counts the total number of resubmits (and a few other activities)
* that have been executed. Returning from a resubmit does not affect this
* counter. Thus, this limits the amount of work that a particular
* translation can perform. Translation aborts when 'resubmits' exceeds
* MAX_RESUBMITS (which is much larger than MAX_DEPTH).
*/
int depth; /* Current resubmit nesting depth. */
int resubmits; /* Total number of resubmits. */
bool in_action_set; /* Currently translating action_set, if true. */
bool in_packet_out; /* Currently translating a packet_out msg, if
* true. */
bool pending_encap; /* True when waiting to commit a pending
* encap action. */
bool pending_decap; /* True when waiting to commit a pending
* decap action. */
struct ofpbuf *encap_data; /* May contain a pointer to an ofpbuf with
* context for the datapath encap action.*/
uint8_t table_id; /* OpenFlow table ID where flow was found. */
ovs_be64 rule_cookie; /* Cookie of the rule being translated. */
uint32_t orig_skb_priority; /* Priority when packet arrived. */
uint32_t sflow_n_outputs; /* Number of output ports. */
odp_port_t sflow_odp_port; /* Output port for composing sFlow action. */
ofp_port_t nf_output_iface; /* Output interface index for NetFlow. */
bool exit; /* No further actions should be processed. */
mirror_mask_t mirrors; /* Bitmap of associated mirrors. */
int mirror_snaplen; /* Max size of a mirror packet in byte. */
/* Freezing Translation
* ====================
*
* At some point during translation, the code may recognize the need to halt
* and checkpoint the translation in a way that it can be restarted again
* later. We call the checkpointing process "freezing" and the restarting
* process "thawing".
*
* The use cases for freezing are:
*
* - "Recirculation", where the translation process discovers that it
* doesn't have enough information to complete translation without
* actually executing the actions that have already been translated,
* which provides the additionally needed information. In these
* situations, translation freezes translation and assigns the frozen
* data a unique "recirculation ID", which it associates with the data
* in a table in userspace (see ofproto-dpif-rid.h). It also adds a
* OVS_ACTION_ATTR_RECIRC action specifying that ID to the datapath
* actions. When a packet hits that action, the datapath looks its
* flow up again using the ID. If there's a miss, it comes back to
* userspace, which find the recirculation table entry for the ID,
* thaws the associated frozen data, and continues translation from
* that point given the additional information that is now known.
*
* The archetypal example is MPLS. As MPLS is implemented in
* OpenFlow, the protocol that follows the last MPLS label becomes
* known only when that label is popped by an OpenFlow action. That
* means that Open vSwitch can't extract the headers beyond the MPLS
* labels until the pop action is executed. Thus, at that point
* translation uses the recirculation process to extract the headers
* beyond the MPLS labels.
*
* (OVS also uses OVS_ACTION_ATTR_RECIRC to implement hashing for
* output to bonds. OVS pre-populates all the datapath flows for bond
* output in the datapath, though, which means that the elaborate
* process of coming back to userspace for a second round of
* translation isn't needed, and so bonds don't follow the above
* process.)
*
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
* - "Continuation". A continuation is a way for an OpenFlow controller
* to interpose on a packet's traversal of the OpenFlow tables. When
* the translation process encounters a "controller" action with the
* "pause" flag, it freezes translation, serializes the frozen data,
* and sends it to an OpenFlow controller. The controller then
* examines and possibly modifies the frozen data and eventually sends
* it back to the switch, which thaws it and continues translation.
*
* The main problem of freezing translation is preserving state, so that
* when the translation is thawed later it resumes from where it left off,
* without disruption. In particular, actions must be preserved as follows:
*
* - If we're freezing because an action needed more information, the
* action that prompted it.
*
* - Any actions remaining to be translated within the current flow.
*
* - If translation was frozen within a NXAST_RESUBMIT, then any actions
* following the resubmit action. Resubmit actions can be nested, so
* this has to go all the way up the control stack.
*
* - The OpenFlow 1.1+ action set.
*
* State that actions and flow table lookups can depend on, such as the
* following, must also be preserved:
*
* - Metadata fields (input port, registers, OF1.1+ metadata, ...).
*
* - The stack used by NXAST_STACK_PUSH and NXAST_STACK_POP actions.
*
* - The table ID and cookie of the flow being translated at each level
* of the control stack, because these can become visible through
* OFPAT_CONTROLLER actions (and other ways).
*
* Translation allows for the control of this state preservation via these
* members. When a need to freeze translation is identified, the
* translation process:
*
* 1. Sets 'freezing' to true.
*
* 2. Sets 'exit' to true to tell later steps that we're exiting from the
* translation process.
*
* 3. Adds an OFPACT_UNROLL_XLATE action to 'frozen_actions', and points
* frozen_actions.header to the action to make it easy to find it later.
* This action holds the current table ID and cookie so that they can be
* restored during a post-recirculation upcall translation.
*
* 4. Adds the action that prompted recirculation and any actions following
* it within the same flow to 'frozen_actions', so that they can be
* executed during a post-recirculation upcall translation.
*
* 5. Returns.
*
* 6. The action that prompted recirculation might be nested in a stack of
* nested "resubmit"s that have actions remaining. Each of these notices
* that we're exiting and freezing and responds by adding more
* OFPACT_UNROLL_XLATE actions to 'frozen_actions', as necessary,
* followed by any actions that were yet unprocessed.
*
* If we're freezing because of recirculation, the caller generates a
* recirculation ID and associates all the state produced by this process
* with it. For post-recirculation upcall translation, the caller passes it
* back in for the new translation to execute. The process yielded a set of
* ofpacts that can be translated directly, so it is not much of a special
* case at that point.
*/
bool freezing;
bool recirc_update_dp_hash; /* Generated recirculation will be preceded
* by datapath HASH action to get an updated
* dp_hash after recirculation. */
uint32_t dp_hash_alg;
uint32_t dp_hash_basis;
struct ofpbuf frozen_actions;
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
const struct ofpact_controller *pause;
/* True if a packet was but is no longer MPLS (due to an MPLS pop action).
* This is a trigger for recirculation in cases where translating an action
* or looking up a flow requires access to the fields of the packet after
* the MPLS label stack that was originally present. */
bool was_mpls;
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
/* True if conntrack has been performed on this packet during processing
* on the current bridge. This is used to determine whether conntrack
* state from the datapath should be honored after thawing. */
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
bool conntracked;
/* Pointer to an embedded NAT action in a conntrack action, or NULL. */
struct ofpact_nat *ct_nat_action;
/* OpenFlow 1.1+ action set.
*
* 'action_set' accumulates "struct ofpact"s added by OFPACT_WRITE_ACTIONS.
* When translation is otherwise complete, ofpacts_execute_action_set()
* converts it to a set of "struct ofpact"s that can be translated into
* datapath actions. */
bool action_set_has_group; /* Action set contains OFPACT_GROUP? */
struct ofpbuf action_set; /* Action set. */
enum xlate_error error; /* Translation failed. */
};
/* Structure to track VLAN manipulation: one 802.1Q tag. */
struct xvlan_single {
    uint16_t tpid;  /* Tag protocol identifier (EtherType of the tag). */
    uint16_t vid;   /* VLAN id. */
    uint16_t pcp;   /* Priority code point. */
};
/* A stack of up to FLOW_MAX_VLAN_HEADERS VLAN tags. */
struct xvlan {
    struct xvlan_single v[FLOW_MAX_VLAN_HEADERS];
};
const char *xlate_strerror(enum xlate_error error)
{
switch (error) {
case XLATE_OK:
return "OK";
case XLATE_BRIDGE_NOT_FOUND:
return "Bridge not found";
case XLATE_RECURSION_TOO_DEEP:
return "Recursion too deep";
case XLATE_TOO_MANY_RESUBMITS:
return "Too many resubmits";
case XLATE_STACK_TOO_DEEP:
return "Stack too deep";
case XLATE_NO_RECIRCULATION_CONTEXT:
return "No recirculation context";
case XLATE_RECIRCULATION_CONFLICT:
return "Recirculation conflict";
case XLATE_TOO_MANY_MPLS_LABELS:
return "Too many MPLS labels";
tun-metadata: Manage tunnel TLV mapping table on a per-bridge basis. When using tunnel TLVs (at the moment, this means Geneve options), a controller must first map the class and type onto an appropriate OXM field so that it can be used in OVS flow operations. This table is managed using OpenFlow extensions. The original code that added support for TLVs made the mapping table global as a simplification. However, this is not really logically correct as the OpenFlow management commands are operating on a per-bridge basis. This removes the original limitation to make the table per-bridge. One nice result of this change is that it is generally clearer whether the tunnel metadata is in datapath or OpenFlow format. Rather than allowing ad-hoc format changes and trying to handle both formats in the tunnel metadata functions, the format is more clearly separated by function. Datapaths (both kernel and userspace) use datapath format and it is not changed during the upcall process. At the beginning of action translation, tunnel metadata is converted to OpenFlow format and flows and wildcards are translated back at the end of the process. As an additional benefit, this change improves performance in some flow setup situations by keeping the tunnel metadata in the original packet format in more cases. This helps when copies need to be made as the amount of data touched is only what is present in the packet rather than the maximum amount of metadata supported. Co-authored-by: Madhu Challa <challa@noironetworks.com> Signed-off-by: Madhu Challa <challa@noironetworks.com> Signed-off-by: Jesse Gross <jesse@kernel.org> Acked-by: Ben Pfaff <blp@ovn.org>
2016-04-19 18:36:04 -07:00
case XLATE_INVALID_TUNNEL_METADATA:
return "Invalid tunnel metadata";
case XLATE_UNSUPPORTED_PACKET_TYPE:
return "Unsupported packet type";
userspace: Improved packet drop statistics. Currently OVS maintains explicit packet drop/error counters only on port level. Packets that are dropped as part of normal OpenFlow processing are counted in flow stats of “drop” flows or as table misses in table stats. These can only be interpreted by controllers that know the semantics of the configured OpenFlow pipeline. Without that knowledge, it is impossible for an OVS user to obtain e.g. the total number of packets dropped due to OpenFlow rules. Furthermore, there are numerous other reasons for which packets can be dropped by OVS slow path that are not related to the OpenFlow pipeline. The generated datapath flow entries include a drop action to avoid further expensive upcalls to the slow path, but subsequent packets dropped by the datapath are not accounted anywhere. Finally, the datapath itself drops packets in certain error situations. Also, these drops are today not accounted for.This makes it difficult for OVS users to monitor packet drop in an OVS instance and to alert a management system in case of a unexpected increase of such drops. Also OVS trouble-shooters face difficulties in analysing packet drops. With this patch we implement following changes to address the issues mentioned above. 1. Identify and account all the silent packet drop scenarios 2. Display these drops in ovs-appctl coverage/show Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Keshav Gupta <keshugupta1@gmail.com> Signed-off-by: Anju Thomas <anju.thomas@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Keshav Gupta <keshugupta1@gmail.com> Acked-by: Eelco Chaudron <echaudro@redhat.com Acked-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2019-12-18 05:48:12 +01:00
case XLATE_CONGESTION_DROP:
return "Congestion Drop";
case XLATE_FORWARDING_DISABLED:
return "Forwarding is disabled";
case XLATE_MAX:
break;
}
return "Unknown error";
}
/* Forward declarations for helpers defined later in this file. */
static void xlate_action_set(struct xlate_ctx *ctx);
static void xlate_commit_actions(struct xlate_ctx *ctx);
static void
patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev,
                  struct xport *out_dev, bool is_last_action);
/* Begins freezing translation: no further actions are translated and the
 * remaining pipeline state is accumulated for a later continuation. */
static void
ctx_trigger_freeze(struct xlate_ctx *ctx)
{
    ctx->freezing = true;
    ctx->exit = true;
}
/* Freezes translation and additionally requests that the datapath recompute
 * dp_hash, using algorithm 'type' with 'basis', before recirculating. */
static void
ctx_trigger_recirculate_with_hash(struct xlate_ctx *ctx, uint32_t type,
                                  uint32_t basis)
{
    ctx->freezing = true;
    ctx->exit = true;

    ctx->recirc_update_dp_hash = true;
    ctx->dp_hash_alg = type;
    ctx->dp_hash_basis = basis;
}
/* Returns true if no frozen actions have been accumulated yet, that is, the
 * next action recorded would be the first frozen one. */
static bool
ctx_first_frozen_action(const struct xlate_ctx *ctx)
{
    return ctx->frozen_actions.size == 0;
}
/* Abandons an in-progress freeze, if any: discards the frozen actions
 * accumulated so far and clears all freeze-related state from 'ctx'. */
static void
ctx_cancel_freeze(struct xlate_ctx *ctx)
{
    if (ctx->freezing) {
        ctx->freezing = false;
        ctx->recirc_update_dp_hash = false;
        ofpbuf_clear(&ctx->frozen_actions);
        ctx->frozen_actions.header = NULL;
        ctx->pause = NULL;
    }
}
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
/* Forward declaration. */
static void finish_freezing(struct xlate_ctx *ctx);
/* These functions and structure are used to save stack space in actions that
 * need to retain a large amount of xlate_ctx state. */
struct xretained_state {
    /* Stub storage for a temporary replacement translation stack. */
    union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)];
    /* Stub storage for a temporary replacement action set. */
    uint64_t actset_stub[1024 / 8];
    struct ofpbuf old_stack;        /* Saved ctx->stack. */
    struct ofpbuf old_action_set;   /* Saved ctx->action_set. */
    struct flow old_flow;           /* Saved ctx->xin->flow. */
    struct flow old_base;           /* Saved ctx->base_flow. */
    struct flow_tnl flow_tnl_mask;  /* Saved ctx->wc->masks.tunnel. */
};
/* Saves the flow, translation stack and action set of 'ctx' into a freshly
 * allocated retained-state record, pointing 'ctx' at stub-backed replacement
 * buffers in the meantime.
 *
 * The return of this function must be freed by
 * xretain_state_restore_and_free(). */
static struct xretained_state *
xretain_state_save(struct xlate_ctx *ctx)
{
    struct xretained_state *state = xmalloc(sizeof *state);

    /* Remember the current flow. */
    state->old_flow = ctx->xin->flow;

    /* Park the translation stack and swap in a stub-backed buffer. */
    state->old_stack = ctx->stack;
    ofpbuf_use_stub(&ctx->stack, state->new_stack, sizeof state->new_stack);

    /* Likewise for the OpenFlow 1.1+ action set. */
    state->old_action_set = ctx->action_set;
    ofpbuf_use_stub(&ctx->action_set, state->actset_stub,
                    sizeof state->actset_stub);

    return state;
}
/* Saves the tunnel metadata mask of 'ctx' into 'retained'. */
static void
xretain_tunnel_mask_save(const struct xlate_ctx *ctx,
                         struct xretained_state *retained)
{
    retained->flow_tnl_mask = ctx->wc->masks.tunnel;
}
/* Saves the base flow of 'ctx' into 'retained'. */
static void
xretain_base_flow_save(const struct xlate_ctx *ctx,
                       struct xretained_state *retained)
{
    retained->old_base = ctx->base_flow;
}
/* Restores the base flow of 'ctx' previously saved with
 * xretain_base_flow_save(). */
static void
xretain_base_flow_restore(struct xlate_ctx *ctx,
                          const struct xretained_state *retained)
{
    ctx->base_flow = retained->old_base;
}
/* Restores the input flow of 'ctx' previously saved by
 * xretain_state_save(). */
static void
xretain_flow_restore(struct xlate_ctx *ctx,
                     const struct xretained_state *retained)
{
    ctx->xin->flow = retained->old_flow;
}
/* Restores the tunnel metadata mask of 'ctx' previously saved with
 * xretain_tunnel_mask_save(). */
static void
xretain_tunnel_mask_restore(struct xlate_ctx *ctx,
                            const struct xretained_state *retained)
{
    ctx->wc->masks.tunnel = retained->flow_tnl_mask;
}
/* Puts back the flow, stack and action set saved in 'retained', releasing
 * the stub-backed replacement buffers, and frees 'retained' itself. */
static void
xretain_state_restore_and_free(struct xlate_ctx *ctx,
                               struct xretained_state *retained)
{
    /* Drop the stub-backed translation stack and restore the saved one. */
    ofpbuf_uninit(&ctx->stack);
    ctx->stack = retained->old_stack;

    /* Likewise for the action set. */
    ofpbuf_uninit(&ctx->action_set);
    ctx->action_set = retained->old_action_set;

    ctx->xin->flow = retained->old_flow;

    free(retained);
}
/* A controller may use OFPP_NONE as the ingress port to indicate that
 * it did not arrive on a "real" port.  'ofpp_none_bundle' exists for
 * when an input bundle is needed for validation (e.g., mirroring or
 * OFPP_NORMAL processing).  It is not connected to an 'ofproto', nor does it
 * have any 'port' structs, so care must be taken when dealing with it. */
static struct xbundle ofpp_none_bundle = {
    .name = "OFPP_NONE",
    .vlan_mode = PORT_VLAN_TRUNK
};
/* Node in 'xport''s 'skb_priorities' map.  Used to maintain a map from
 * 'priority' (the datapath's term for QoS queue) to the dscp bits which all
 * traffic egressing the 'ofport' with that priority should be marked with. */
struct skb_priority_to_dscp {
    struct hmap_node hmap_node; /* Node in 'ofport_dpif''s 'skb_priorities'. */
    uint32_t skb_priority;      /* Priority of this queue (see struct flow). */
    uint8_t dscp;               /* DSCP bits to mark outgoing traffic with. */
};
/* Xlate config contains hash maps of all bridges, bundles and ports.
 * Xcfgp contains the pointer to the current xlate configuration.
 * When the main thread needs to change the configuration, it copies xcfgp to
 * new_xcfg and edits new_xcfg.  This enables the use of RCU locking which
 * does not block handler and revalidator threads. */
struct xlate_cfg {
    struct hmap xbridges;    /* Contains "struct xbridge"s. */
    struct hmap xbundles;    /* Contains "struct xbundle"s. */
    struct hmap xports;      /* Contains "struct xport"s. */
    struct hmap xports_uuid; /* "struct xport"s indexed by UUID. */
};

/* Current configuration, accessed via RCU by readers. */
static OVSRCU_TYPE(struct xlate_cfg *) xcfgp = OVSRCU_INITIALIZER(NULL);
/* In-progress copy being edited by the main thread, or NULL. */
static struct xlate_cfg *new_xcfg = NULL;
/* Signature shared by do_xlate_actions() and clone_xlate_actions() so that
 * action translation can be dispatched through a function pointer. */
typedef void xlate_actions_handler(const struct ofpact *, size_t ofpacts_len,
                                   struct xlate_ctx *, bool, bool);

static bool may_receive(const struct xport *, struct xlate_ctx *);
static void do_xlate_actions(const struct ofpact *, size_t ofpacts_len,
                             struct xlate_ctx *, bool, bool);
static void clone_xlate_actions(const struct ofpact *, size_t ofpacts_len,
                                struct xlate_ctx *, bool, bool);
/* Forward declarations for NORMAL-action processing and VLAN helpers. */
static void xlate_normal(struct xlate_ctx *);
static void xlate_normal_flood(struct xlate_ctx *ct,
                               struct xbundle *in_xbundle, struct xvlan *);
static void xlate_table_action(struct xlate_ctx *, ofp_port_t in_port,
                               uint8_t table_id, bool may_packet_in,
                               bool honor_table_miss, bool with_ct_orig,
                               bool is_last_action, xlate_actions_handler *);
static bool input_vid_is_valid(const struct xlate_ctx *,
                               uint16_t vid, struct xbundle *);
static void xvlan_copy(struct xvlan *dst, const struct xvlan *src);
static void xvlan_pop(struct xvlan *src);
static void xvlan_push_uninit(struct xvlan *src);
static void xvlan_extract(const struct flow *, struct xvlan *);
static void xvlan_put(struct flow *, const struct xvlan *,
                      enum port_priority_tags_mode);
static void xvlan_input_translate(const struct xbundle *,
                                  const struct xvlan *in,
                                  struct xvlan *xvlan);
static void xvlan_output_translate(const struct xbundle *,
                                   const struct xvlan *xvlan,
                                   struct xvlan *out);
static void output_normal(struct xlate_ctx *, const struct xbundle *,
                          const struct xvlan *);
/* Optional bond recirculation parameter to compose_output_action(). */
struct xlate_bond_recirc {
    uint32_t recirc_id; /* !0 Use recirculation instead of output. */
    uint8_t  hash_alg;  /* !0 Compute hash for recirc before. */
    uint32_t hash_basis;  /* Compute hash for recirc before. */
};
static void compose_output_action(struct xlate_ctx *, ofp_port_t ofp_port,
                                  const struct xlate_bond_recirc *xr,
                                  bool is_last_action, bool truncate);

/* Lookup helpers over the current xlate configuration. */
static struct xbridge *xbridge_lookup(struct xlate_cfg *,
                                      const struct ofproto_dpif *);
static struct xbridge *xbridge_lookup_by_uuid(struct xlate_cfg *,
                                              const struct uuid *);
static struct xbundle *xbundle_lookup(struct xlate_cfg *,
                                      const struct ofbundle *);
static struct xport *xport_lookup(struct xlate_cfg *,
                                  const struct ofport_dpif *);
static struct xport *xport_lookup_by_uuid(struct xlate_cfg *,
                                          const struct uuid *);
static struct xport *get_ofp_port(const struct xbridge *, ofp_port_t ofp_port);

/* QoS queue to DSCP mapping helpers. */
static struct skb_priority_to_dscp *get_skb_priority(const struct xport *,
                                                     uint32_t skb_priority);
static void clear_skb_priorities(struct xport *);
static size_t count_skb_priorities(const struct xport *);
static bool dscp_from_skb_priority(const struct xport *, uint32_t skb_priority,
                                   uint8_t *dscp);

/* Construction, reconfiguration, removal and copying of the xlate
 * configuration tree (bridges, bundles, ports). */
static void xlate_xbridge_init(struct xlate_cfg *, struct xbridge *);
static void xlate_xbundle_init(struct xlate_cfg *, struct xbundle *);
static void xlate_xport_init(struct xlate_cfg *, struct xport *);
static void xlate_xbridge_set(struct xbridge *, struct dpif *,
                              const struct mac_learning *, struct stp *,
                              struct rstp *, const struct mcast_snooping *,
                              const struct mbridge *,
                              const struct dpif_sflow *,
                              const struct dpif_ipfix *,
                              const struct netflow *,
                              bool forward_bpdu, bool has_in_band,
                              const struct dpif_backer_support *,
                              const struct xbridge_addr *);
static void xlate_xbundle_set(struct xbundle *xbundle,
                              enum port_vlan_mode vlan_mode,
                              uint16_t qinq_ethtype, int vlan,
                              unsigned long *trunks, unsigned long *cvlans,
                              enum port_priority_tags_mode,
                              const struct bond *bond, const struct lacp *lacp,
                              bool floodable, bool protected);
static void xlate_xport_set(struct xport *xport, odp_port_t odp_port,
                            const struct netdev *netdev, const struct cfm *cfm,
                            const struct bfd *bfd, const struct lldp *lldp,
                            int stp_port_no, const struct rstp_port *rstp_port,
                            enum ofputil_port_config config,
                            enum ofputil_port_state state, bool is_tunnel,
                            bool may_enable);
static void xlate_xbridge_remove(struct xlate_cfg *, struct xbridge *);
static void xlate_xbundle_remove(struct xlate_cfg *, struct xbundle *);
static void xlate_xport_remove(struct xlate_cfg *, struct xport *);
static void xlate_xbridge_copy(struct xbridge *);
static void xlate_xbundle_copy(struct xbridge *, struct xbundle *);
static void xlate_xport_copy(struct xbridge *, struct xbundle *,
                             struct xport *);
static void xlate_xcfg_free(struct xlate_cfg *);
/* Tracing helpers. */
/* If tracing is enabled in 'ctx', creates a new trace node of type 'type',
 * with its text built by treating 'format' as a printf format string, and
 * appends it to the list of nodes maintained in ctx->xin.  Returns the list
 * of nodes embedded within the new trace node; ordinarily the caller can
 * ignore this, but it is useful if the caller needs to nest more trace nodes
 * within the new node.
 *
 * If tracing is not enabled, does nothing and returns NULL. */
static struct ovs_list * OVS_PRINTF_FORMAT(3, 4)
xlate_report(const struct xlate_ctx *ctx, enum oftrace_node_type type,
             const char *format, ...)
{
    if (OVS_LIKELY(!ctx->xin->trace)) {
        return NULL;
    }

    va_list args;
    va_start(args, format);
    char *text = xvasprintf(format, args);
    va_end(args);

    struct ovs_list *subtrace
        = &oftrace_report(ctx->xin->trace, type, text)->subs;
    free(text);
    return subtrace;
}
/* This is like xlate_report() for errors that are serious enough that we
 * should log them even if we are not tracing. */
static void OVS_PRINTF_FORMAT(2, 3)
xlate_report_error(const struct xlate_ctx *ctx, const char *format, ...)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

    /* Skip unless tracing, or unless there is an actual packet and the rate
     * limiter admits a warning. */
    if (!OVS_UNLIKELY(ctx->xin->trace)
        && (!ctx->xin->packet || VLOG_DROP_WARN(&rl))) {
        return;
    }

    va_list args;
    va_start(args, format);
    char *msg = xvasprintf(format, args);
    va_end(args);

    if (ctx->xin->trace) {
        oftrace_report(ctx->xin->trace, OFT_ERROR, msg);
    } else {
        struct ds s = DS_EMPTY_INITIALIZER;
        ds_put_format(&s, "%s on bridge %s while processing ",
                      msg, ctx->xbridge->name);
        flow_format(&s, &ctx->base_flow, NULL);
        VLOG_WARN("%s", ds_cstr(&s));
        ds_destroy(&s);
    }
    free(msg);
}
/* This is like xlate_report() for messages that should be logged at the
 * info level (even when not tracing). */
static void OVS_PRINTF_FORMAT(2, 3)
xlate_report_info(const struct xlate_ctx *ctx, const char *format, ...)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

    /* Skip unless tracing, or unless there is an actual packet and the rate
     * limiter admits an info message. */
    if (!OVS_UNLIKELY(ctx->xin->trace)
        && (!ctx->xin->packet || VLOG_DROP_INFO(&rl))) {
        return;
    }

    va_list args;
    va_start(args, format);
    char *msg = xvasprintf(format, args);
    va_end(args);

    if (ctx->xin->trace) {
        oftrace_report(ctx->xin->trace, OFT_WARN, msg);
    } else {
        struct ds s = DS_EMPTY_INITIALIZER;
        ds_put_format(&s, "%s on bridge %s while processing ",
                      msg, ctx->xbridge->name);
        flow_format(&s, &ctx->base_flow, NULL);
        VLOG_INFO("%s", ds_cstr(&s));
        ds_destroy(&s);
    }
    free(msg);
}
/* This is like xlate_report() for messages that should be logged at debug
 * level (even if we are not tracing) because they can be valuable for
 * debugging. */
static void OVS_PRINTF_FORMAT(3, 4)
xlate_report_debug(const struct xlate_ctx *ctx, enum oftrace_node_type type,
                   const char *format, ...)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300);

    /* Skip unless tracing, or unless there is an actual packet and the rate
     * limiter admits a debug message. */
    if (!OVS_UNLIKELY(ctx->xin->trace)
        && (!ctx->xin->packet || VLOG_DROP_DBG(&rl))) {
        return;
    }

    va_list args;
    va_start(args, format);
    char *msg = xvasprintf(format, args);
    va_end(args);

    if (ctx->xin->trace) {
        oftrace_report(ctx->xin->trace, type, msg);
    } else {
        VLOG_DBG("bridge %s: %s", ctx->xbridge->name, msg);
    }
    free(msg);
}
/* If tracing is enabled in 'ctx', appends a node of the given 'type' to the
 * trace, whose text is 'title' followed by a formatted version of the
 * 'ofpacts_len' OpenFlow actions in 'ofpacts'.
 *
 * If tracing is not enabled, does nothing. */
static void
xlate_report_actions(const struct xlate_ctx *ctx, enum oftrace_node_type type,
                     const char *title,
                     const struct ofpact *ofpacts, size_t ofpacts_len)
{
    if (OVS_LIKELY(!ctx->xin->trace)) {
        return;
    }

    struct ds s = DS_EMPTY_INITIALIZER;
    struct ofpact_format_params fp = { .s = &s };

    ds_put_format(&s, "%s: ", title);
    ofpacts_format(ofpacts, ofpacts_len, &fp);
    oftrace_report(ctx->xin->trace, type, ds_cstr(&s));
    ds_destroy(&s);
}
/* If tracing is enabled in 'ctx', appends a node of type OFT_DETAIL to the
 * trace, whose message is a formatted version of the OpenFlow action set.
 * 'verb' should be "was" or "is", depending on whether the action set
 * reported is the new action set or the old one.
 *
 * If tracing is not enabled, does nothing. */
static void
xlate_report_action_set(const struct xlate_ctx *ctx, const char *verb)
{
    if (OVS_LIKELY(!ctx->xin->trace)) {
        return;
    }

    /* Convert the action set into an equivalent action list so it can be
     * formatted with the ordinary action formatter. */
    struct ofpbuf action_list;
    ofpbuf_init(&action_list, 0);
    ofpacts_execute_action_set(&action_list, &ctx->action_set);

    if (!action_list.size) {
        xlate_report(ctx, OFT_DETAIL, "action set %s empty", verb);
    } else {
        struct ds s = DS_EMPTY_INITIALIZER;
        struct ofpact_format_params fp = { .s = &s };

        ofpacts_format(action_list.data, action_list.size, &fp);
        xlate_report(ctx, OFT_DETAIL, "action set %s: %s",
                     verb, ds_cstr(&s));
        ds_destroy(&s);
    }
    ofpbuf_uninit(&action_list);
}
/* If tracing is enabled in 'ctx', appends a node representing 'rule' (in
* OpenFlow table 'table_id') to the trace and makes this node the parent for
* future trace nodes. The caller should save ctx->xin->trace before calling
* this function, then after tracing all of the activities under the table,
* restore its previous value.
*
* If tracing is not enabled, does nothing. */
static void
xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule,
                   uint8_t table_id)
{
    if (OVS_LIKELY(!ctx->xin->trace)) {
        return;
    }

    struct ds s = DS_EMPTY_INITIALIZER;
    ds_put_format(&s, "%2d. ", table_id);
    /* The three built-in rules get fixed explanatory text instead of a
     * formatted match. */
    if (rule == ctx->xin->ofproto->miss_rule) {
        ds_put_cstr(&s, "No match, and a \"packet-in\" is called for.");
    } else if (rule == ctx->xin->ofproto->no_packet_in_rule) {
        ds_put_cstr(&s, "No match.");
    } else if (rule == ctx->xin->ofproto->drop_frags_rule) {
        ds_put_cstr(&s, "Packets are IP fragments and "
                    "the fragment handling mode is \"drop\".");
    } else {
        /* Ordinary rule: format its match, optionally translating port
         * numbers to names when the caller asked for names. */
        struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map);
        if (ctx->xin->names) {
            struct ofproto_dpif *ofprotop;
            ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name);
            ofproto_append_ports_to_map(&map, ofprotop->up.ports);
        }
        minimatch_format(&rule->up.cr.match,
                         ofproto_get_tun_tab(&ctx->xin->ofproto->up),
                         &map, &s, OFP_DEFAULT_PRIORITY);
        ofputil_port_map_destroy(&map);
        /* An empty match formats as "" (no trailing space); add a
         * separator only when the match text did not already end one. */
        if (ds_last(&s) != ' ') {
            ds_put_cstr(&s, ", ");
        }
        ds_put_format(&s, "priority %d", rule->up.cr.priority);
        if (rule->up.flow_cookie) {
            ds_put_format(&s, ", cookie %#"PRIx64,
                          ntohll(rule->up.flow_cookie));
        }
    }
    /* Make the new OFT_TABLE node the parent for subsequent trace nodes;
     * the caller is responsible for restoring ctx->xin->trace afterward
     * (see the comment above this function). */
    ctx->xin->trace = &oftrace_report(ctx->xin->trace, OFT_TABLE,
                                      ds_cstr(&s))->subs;
    ds_destroy(&s);
}
/* If tracing is enabled in 'ctx', adds an OFT_DETAIL trace node to 'ctx'
* reporting the value of subfield 'sf'.
*
* If tracing is not enabled, does nothing. */
static void
xlate_report_subfield(const struct xlate_ctx *ctx,
const struct mf_subfield *sf)
{
if (OVS_UNLIKELY(ctx->xin->trace)) {
struct ds s = DS_EMPTY_INITIALIZER;
mf_format_subfield(sf, &s);
ds_put_cstr(&s, " is now ");
if (sf->ofs == 0 && sf->n_bits >= sf->field->n_bits) {
union mf_value value;
mf_get_value(sf->field, &ctx->xin->flow, &value);
mf_format(sf->field, &value, NULL, NULL, &s);
} else {
union mf_subvalue cst;
mf_read_subfield(sf, &ctx->xin->flow, &cst);
ds_put_hex(&s, &cst, sizeof cst);
}
xlate_report(ctx, OFT_DETAIL, "%s", ds_cstr(&s));
ds_destroy(&s);
}
}
/* Adds 'xbridge' to 'xcfg': initializes its per-bridge containers and links
 * it into the configuration's bridge map, keyed on the ofproto's UUID (the
 * UUID gives much better hash entropy than the ofproto pointer).
 *
 * Note: the original text here was corrupted by interleaved VCS blame
 * residue that split the hmap_insert() call; this restores the valid
 * statement. */
static void
xlate_xbridge_init(struct xlate_cfg *xcfg, struct xbridge *xbridge)
{
    ovs_list_init(&xbridge->xbundles);
    hmap_init(&xbridge->xports);
    hmap_insert(&xcfg->xbridges, &xbridge->hmap_node,
                uuid_hash(&xbridge->ofproto->uuid));
}
/* Registers 'xbundle' with 'xcfg' and hooks it into its parent bridge. */
static void
xlate_xbundle_init(struct xlate_cfg *xcfg, struct xbundle *xbundle)
{
    /* Index the bundle by its ofbundle pointer in the configuration-wide
     * map. */
    hmap_insert(&xcfg->xbundles, &xbundle->hmap_node,
                hash_pointer(xbundle->ofbundle, 0));

    /* Attach the bundle to its bridge and prepare its member-port list. */
    ovs_list_insert(&xbundle->xbridge->xbundles, &xbundle->list_node);
    ovs_list_init(&xbundle->xports);
}
/* Registers 'xport' with 'xcfg': indexes it by ofport pointer and by UUID
 * in the configuration-wide maps, and by OpenFlow port number within its
 * bridge. */
static void
xlate_xport_init(struct xlate_cfg *xcfg, struct xport *xport)
{
    hmap_init(&xport->skb_priorities);

    /* Configuration-wide indexes. */
    hmap_insert(&xcfg->xports, &xport->hmap_node,
                hash_pointer(xport->ofport, 0));
    hmap_insert(&xcfg->xports_uuid, &xport->uuid_node,
                uuid_hash(&xport->uuid));

    /* Per-bridge index by OpenFlow port number. */
    hmap_insert(&xport->xbridge->xports, &xport->ofp_node,
                hash_ofp_port(xport->ofp_port));
}
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
static struct xbridge_addr *
xbridge_addr_create(struct xbridge *xbridge)
{
    /* Default to the bridge's current address set; a new one is allocated
     * only if the device's address list actually changed. */
    struct xbridge_addr *xbridge_addr = xbridge->addr;
    struct in6_addr *addr = NULL, *mask = NULL;
    struct netdev *dev;
    int err, n_addr = 0;

    err = netdev_open(xbridge->name, NULL, &dev);
    if (!err) {
        err = netdev_get_addr_list(dev, &addr, &mask, &n_addr);
        if (!err) {
            /* Allocate a fresh xbridge_addr when there is no previous set
             * or the count/content of addresses differs. */
            if (!xbridge->addr ||
                n_addr != xbridge->addr->n_addr ||
                (xbridge->addr->addr && memcmp(addr, xbridge->addr->addr,
                                               sizeof(*addr) * n_addr))) {
                xbridge_addr = xzalloc(sizeof *xbridge_addr);
                /* The new struct takes ownership of 'addr'. */
                xbridge_addr->addr = addr;
                xbridge_addr->n_addr = n_addr;
                ovs_refcount_init(&xbridge_addr->ref_cnt);
            } else {
                /* Unchanged: keep the existing set, discard the copy. */
                free(addr);
            }
            /* Netmasks are not tracked; always release them. */
            free(mask);
        }
        netdev_close(dev);
    }

    /* Returns the existing set (unchanged or on error) or a freshly
     * allocated one with a single reference. */
    return xbridge_addr;
}
static struct xbridge_addr *
xbridge_addr_ref(const struct xbridge_addr *addr_)
{
struct xbridge_addr *addr = CONST_CAST(struct xbridge_addr *, addr_);
if (addr) {
ovs_refcount_ref(&addr->ref_cnt);
}
return addr;
}
/* Releases a reference to 'addr', freeing it (and its address array) when
 * the last reference is dropped.  NULL is a no-op. */
static void
xbridge_addr_unref(struct xbridge_addr *addr)
{
    if (!addr) {
        return;
    }

    if (ovs_refcount_unref_relaxed(&addr->ref_cnt) == 1) {
        free(addr->addr);
        free(addr);
    }
}
static void
xlate_xbridge_set(struct xbridge *xbridge,
struct dpif *dpif,
const struct mac_learning *ml, struct stp *stp,
struct rstp *rstp, const struct mcast_snooping *ms,
const struct mbridge *mbridge,
const struct dpif_sflow *sflow,
const struct dpif_ipfix *ipfix,
const struct netflow *netflow,
bool forward_bpdu, bool has_in_band,
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
const struct dpif_backer_support *support,
const struct xbridge_addr *addr)
{
if (xbridge->ml != ml) {
mac_learning_unref(xbridge->ml);
xbridge->ml = mac_learning_ref(ml);
}
if (xbridge->ms != ms) {
mcast_snooping_unref(xbridge->ms);
xbridge->ms = mcast_snooping_ref(ms);
}
if (xbridge->mbridge != mbridge) {
mbridge_unref(xbridge->mbridge);
xbridge->mbridge = mbridge_ref(mbridge);
}
if (xbridge->sflow != sflow) {
dpif_sflow_unref(xbridge->sflow);
xbridge->sflow = dpif_sflow_ref(sflow);
}
if (xbridge->ipfix != ipfix) {
dpif_ipfix_unref(xbridge->ipfix);
xbridge->ipfix = dpif_ipfix_ref(ipfix);
}
if (xbridge->stp != stp) {
stp_unref(xbridge->stp);
xbridge->stp = stp_ref(stp);
}
if (xbridge->rstp != rstp) {
rstp_unref(xbridge->rstp);
xbridge->rstp = rstp_ref(rstp);
}
if (xbridge->netflow != netflow) {
netflow_unref(xbridge->netflow);
xbridge->netflow = netflow_ref(netflow);
}
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
if (xbridge->addr != addr) {
xbridge_addr_unref(xbridge->addr);
xbridge->addr = xbridge_addr_ref(addr);
}
xbridge->dpif = dpif;
xbridge->forward_bpdu = forward_bpdu;
xbridge->has_in_band = has_in_band;
xbridge->support = *support;
}
/* Updates the members of 'xbundle' from the given configuration.
 *
 * 'cvlans' is deep-copied rather than shared: the ofbundle's bitmap may be
 * freed by the main thread while revalidator threads are still reading the
 * xbundle, which would be a use-after-free.  'trunks' is stored as-is
 * (historical behavior; its lifetime is managed by the caller).
 *
 * Note: the original text here was corrupted by interleaved VCS blame
 * residue around the cvlans handling; this restores the valid code. */
static void
xlate_xbundle_set(struct xbundle *xbundle,
                  enum port_vlan_mode vlan_mode, uint16_t qinq_ethtype,
                  int vlan, unsigned long *trunks, unsigned long *cvlans,
                  enum port_priority_tags_mode use_priority_tags,
                  const struct bond *bond, const struct lacp *lacp,
                  bool floodable, bool protected)
{
    ovs_assert(xbundle->xbridge);

    xbundle->vlan_mode = vlan_mode;
    xbundle->qinq_ethtype = qinq_ethtype;
    xbundle->vlan = vlan;
    xbundle->trunks = trunks;
    /* Keep our own copy of the customer-VLAN bitmap; only reallocate when
     * the contents changed. */
    if (!vlan_bitmap_equal(xbundle->cvlans, cvlans)) {
        free(xbundle->cvlans);
        xbundle->cvlans = vlan_bitmap_clone(cvlans);
    }
    xbundle->use_priority_tags = use_priority_tags;
    xbundle->floodable = floodable;
    xbundle->protected = protected;

    if (xbundle->bond != bond) {
        bond_unref(xbundle->bond);
        xbundle->bond = bond_ref(bond);
    }

    if (xbundle->lacp != lacp) {
        lacp_unref(xbundle->lacp);
        xbundle->lacp = lacp_ref(lacp);
    }
}
/* Updates the members of 'xport' from the given configuration, swapping
 * reference-counted members only when they changed. */
static void
xlate_xport_set(struct xport *xport, odp_port_t odp_port,
                const struct netdev *netdev, const struct cfm *cfm,
                const struct bfd *bfd, const struct lldp *lldp, int stp_port_no,
                const struct rstp_port *rstp_port,
                enum ofputil_port_config config, enum ofputil_port_state state,
                bool is_tunnel, bool may_enable)
{
    /* Plain-value members can simply be overwritten. */
    xport->odp_port = odp_port;
    xport->config = config;
    xport->state = state;
    xport->stp_port_no = stp_port_no;
    xport->is_tunnel = is_tunnel;
    xport->may_enable = may_enable;
    xport->pt_mode = netdev_get_pt_mode(netdev);

    /* Reference-counted members: drop the old reference and take a new one
     * only on change. */
    if (rstp_port != xport->rstp_port) {
        rstp_port_unref(xport->rstp_port);
        xport->rstp_port = rstp_port_ref(rstp_port);
    }

    if (cfm != xport->cfm) {
        cfm_unref(xport->cfm);
        xport->cfm = cfm_ref(cfm);
    }

    if (bfd != xport->bfd) {
        bfd_unref(xport->bfd);
        xport->bfd = bfd_ref(bfd);
    }

    if (lldp != xport->lldp) {
        lldp_unref(xport->lldp);
        xport->lldp = lldp_ref(lldp);
    }

    if (netdev != xport->netdev) {
        netdev_close(xport->netdev);
        xport->netdev = netdev_ref(netdev);
    }
}
static void
xlate_xbridge_copy(struct xbridge *xbridge)
{
struct xbundle *xbundle;
struct xport *xport;
struct xbridge *new_xbridge = xzalloc(sizeof *xbridge);
new_xbridge->ofproto = xbridge->ofproto;
new_xbridge->name = xstrdup(xbridge->name);
xlate_xbridge_init(new_xcfg, new_xbridge);
xlate_xbridge_set(new_xbridge,
xbridge->dpif, xbridge->ml, xbridge->stp,
xbridge->rstp, xbridge->ms, xbridge->mbridge,
xbridge->sflow, xbridge->ipfix, xbridge->netflow,
xbridge->forward_bpdu, xbridge->has_in_band,
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
&xbridge->support, xbridge->addr);
LIST_FOR_EACH (xbundle, list_node, &xbridge->xbundles) {
xlate_xbundle_copy(new_xbridge, xbundle);
}
/* Copy xports which are not part of a xbundle */
HMAP_FOR_EACH (xport, ofp_node, &xbridge->xports) {
if (!xport->xbundle) {
xlate_xport_copy(new_xbridge, NULL, xport);
}
}
}
/* Clones 'xbundle' (and, recursively, its member ports) under 'xbridge' in
 * the xlate configuration under construction (new_xcfg). */
static void
xlate_xbundle_copy(struct xbridge *xbridge, struct xbundle *xbundle)
{
    struct xbundle *copy = xzalloc(sizeof *xbundle);
    struct xport *xport;

    copy->ofbundle = xbundle->ofbundle;
    copy->xbridge = xbridge;
    copy->name = xstrdup(xbundle->name);
    xlate_xbundle_init(new_xcfg, copy);

    xlate_xbundle_set(copy, xbundle->vlan_mode, xbundle->qinq_ethtype,
                      xbundle->vlan, xbundle->trunks, xbundle->cvlans,
                      xbundle->use_priority_tags, xbundle->bond,
                      xbundle->lacp, xbundle->floodable, xbundle->protected);

    LIST_FOR_EACH (xport, bundle_node, &xbundle->xports) {
        xlate_xport_copy(xbridge, copy, xport);
    }
}
static void
xlate_xport_copy(struct xbridge *xbridge, struct xbundle *xbundle,
struct xport *xport)
{
struct skb_priority_to_dscp *pdscp, *new_pdscp;
struct xport *new_xport = xzalloc(sizeof *xport);
new_xport->ofport = xport->ofport;
new_xport->ofp_port = xport->ofp_port;
new_xport->xbridge = xbridge;
new_xport->uuid = xport->uuid;
xlate_xport_init(new_xcfg, new_xport);
xlate_xport_set(new_xport, xport->odp_port, xport->netdev, xport->cfm,
xport->bfd, xport->lldp, xport->stp_port_no,
xport->rstp_port, xport->config, xport->state,
xport->is_tunnel, xport->may_enable);
if (xport->peer) {
struct xport *peer = xport_lookup(new_xcfg, xport->peer->ofport);
if (peer) {
new_xport->peer = peer;
new_xport->peer->peer = new_xport;
}
}
if (xbundle) {
new_xport->xbundle = xbundle;
ovs_list_insert(&new_xport->xbundle->xports, &new_xport->bundle_node);
}
HMAP_FOR_EACH (pdscp, hmap_node, &xport->skb_priorities) {
new_pdscp = xmalloc(sizeof *pdscp);
new_pdscp->skb_priority = pdscp->skb_priority;
new_pdscp->dscp = pdscp->dscp;
hmap_insert(&new_xport->skb_priorities, &new_pdscp->hmap_node,
hash_int(new_pdscp->skb_priority, 0));
}
}
/* Sets the current xlate configuration to new_xcfg and frees the old xlate
* configuration in xcfgp.
*
* This needs to be called after editing the xlate configuration.
*
* Functions that edit the new xlate configuration are
* xlate_<ofproto/bundle/ofport>_set and xlate_<ofproto/bundle/ofport>_remove.
*
* A sample workflow:
*
* xlate_txn_start();
* ...
* edit_xlate_configuration();
* ...
* xlate_txn_commit();
*
* The ovsrcu_synchronize() call here also ensures that the upcall threads
* retain no references to anything in the previous configuration.
*/
void
xlate_txn_commit(void)
{
    struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);

    /* Publish the edited configuration.  Readers that started before this
     * point may still be using 'xcfg'. */
    ovsrcu_set(&xcfgp, new_xcfg);
    /* Wait until every thread has quiesced at least once, so no reader can
     * still hold a pointer into the old configuration; only then is it
     * safe to free it.  The order publish -> synchronize -> free must not
     * change. */
    ovsrcu_synchronize();
    xlate_xcfg_free(xcfg);

    /* The transaction is over; the next edit must call xlate_txn_start(). */
    new_xcfg = NULL;
}
/* Copies the current xlate configuration in xcfgp to new_xcfg.
*
* This needs to be called prior to editing the xlate configuration. */
void
xlate_txn_start(void)
{
    /* Transactions must not nest. */
    ovs_assert(!new_xcfg);

    /* Start from an empty configuration. */
    new_xcfg = xmalloc(sizeof *new_xcfg);
    hmap_init(&new_xcfg->xbridges);
    hmap_init(&new_xcfg->xbundles);
    hmap_init(&new_xcfg->xports);
    hmap_init(&new_xcfg->xports_uuid);

    /* Clone the committed configuration, if any, so edits operate on a
     * private copy. */
    struct xlate_cfg *cur = ovsrcu_get(struct xlate_cfg *, &xcfgp);
    if (cur) {
        struct xbridge *xbridge;

        HMAP_FOR_EACH (xbridge, hmap_node, &cur->xbridges) {
            xlate_xbridge_copy(xbridge);
        }
    }
}
/* Destroys 'xcfg' and everything it contains.  NULL is a no-op. */
static void
xlate_xcfg_free(struct xlate_cfg *xcfg)
{
    if (!xcfg) {
        return;
    }

    /* Removing a bridge also removes its bundles and ports, which empties
     * the other maps. */
    struct xbridge *xbridge;
    HMAP_FOR_EACH_SAFE (xbridge, hmap_node, &xcfg->xbridges) {
        xlate_xbridge_remove(xcfg, xbridge);
    }

    hmap_destroy(&xcfg->xbridges);
    hmap_destroy(&xcfg->xbundles);
    hmap_destroy(&xcfg->xports);
    hmap_destroy(&xcfg->xports_uuid);
    free(xcfg);
}
void
xlate_ofproto_set(struct ofproto_dpif *ofproto, const char *name,
struct dpif *dpif,
const struct mac_learning *ml, struct stp *stp,
struct rstp *rstp, const struct mcast_snooping *ms,
const struct mbridge *mbridge,
const struct dpif_sflow *sflow,
const struct dpif_ipfix *ipfix,
const struct netflow *netflow,
bool forward_bpdu, bool has_in_band,
const struct dpif_backer_support *support)
{
struct xbridge *xbridge;
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
struct xbridge_addr *xbridge_addr, *old_addr;
ovs_assert(new_xcfg);
xbridge = xbridge_lookup(new_xcfg, ofproto);
if (!xbridge) {
xbridge = xzalloc(sizeof *xbridge);
xbridge->ofproto = ofproto;
xlate_xbridge_init(new_xcfg, xbridge);
}
free(xbridge->name);
xbridge->name = xstrdup(name);
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
xbridge_addr = xbridge_addr_create(xbridge);
old_addr = xbridge->addr;
xlate_xbridge_set(xbridge, dpif, ml, stp, rstp, ms, mbridge, sflow, ipfix,
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
netflow, forward_bpdu, has_in_band, support,
xbridge_addr);
if (xbridge_addr != old_addr) {
xbridge_addr_unref(xbridge_addr);
}
}
/* Removes 'xbridge' from 'xcfg', first removing all of its ports and
 * bundles, then dropping every reference the bridge holds and freeing it.
 * NULL is a no-op.
 *
 * Note: the original text here was corrupted by interleaved VCS blame
 * residue before the xbridge_addr_unref() call; this restores the valid
 * sequence. */
static void
xlate_xbridge_remove(struct xlate_cfg *xcfg, struct xbridge *xbridge)
{
    struct xbundle *xbundle;
    struct xport *xport;

    if (!xbridge) {
        return;
    }

    HMAP_FOR_EACH_SAFE (xport, ofp_node, &xbridge->xports) {
        xlate_xport_remove(xcfg, xport);
    }

    LIST_FOR_EACH_SAFE (xbundle, list_node, &xbridge->xbundles) {
        xlate_xbundle_remove(xcfg, xbundle);
    }

    hmap_remove(&xcfg->xbridges, &xbridge->hmap_node);
    mac_learning_unref(xbridge->ml);
    mcast_snooping_unref(xbridge->ms);
    mbridge_unref(xbridge->mbridge);
    dpif_sflow_unref(xbridge->sflow);
    dpif_ipfix_unref(xbridge->ipfix);
    netflow_unref(xbridge->netflow);
    stp_unref(xbridge->stp);
    rstp_unref(xbridge->rstp);
    xbridge_addr_unref(xbridge->addr);
    hmap_destroy(&xbridge->xports);
    free(xbridge->name);
    free(xbridge);
}
/* Removes the xbridge shadowing 'ofproto' from the xlate configuration
 * under construction.  Must be called between xlate_txn_start() and
 * xlate_txn_commit(); a bridge that is not present is a no-op. */
void
xlate_remove_ofproto(struct ofproto_dpif *ofproto)
{
    ovs_assert(new_xcfg);

    /* xlate_xbridge_remove() tolerates a NULL lookup result. */
    xlate_xbridge_remove(new_xcfg, xbridge_lookup(new_xcfg, ofproto));
}
/* Creates (if necessary) and configures the xbundle for 'ofbundle' in the
 * new xlate configuration, copying the supplied bundle parameters into it.
 * Requires 'new_xcfg' to be non-NULL, i.e. an xlate transaction must be in
 * progress. */
void
xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle,
                 const char *name, enum port_vlan_mode vlan_mode,
                 uint16_t qinq_ethtype, int vlan,
                 unsigned long *trunks, unsigned long *cvlans,
                 enum port_priority_tags_mode use_priority_tags,
                 const struct bond *bond, const struct lacp *lacp,
                 bool floodable, bool protected)
{
    struct xbundle *xbundle;

    ovs_assert(new_xcfg);

    xbundle = xbundle_lookup(new_xcfg, ofbundle);
    if (!xbundle) {
        /* First time this ofbundle is seen: allocate its translation-side
         * counterpart and register it with the new configuration. */
        xbundle = xzalloc(sizeof *xbundle);
        xbundle->ofbundle = ofbundle;
        xbundle->xbridge = xbridge_lookup(new_xcfg, ofproto);

        xlate_xbundle_init(new_xcfg, xbundle);
    }

    /* The name can change across reconfigurations; always refresh it. */
    free(xbundle->name);
    xbundle->name = xstrdup(name);

    xlate_xbundle_set(xbundle, vlan_mode, qinq_ethtype, vlan, trunks, cvlans,
                      use_priority_tags, bond, lacp, floodable, protected);
}
/* Detaches 'xbundle' from 'xcfg', clears the back-pointer of every member
 * xport, releases the resources the bundle references, and frees it.
 * A NULL 'xbundle' is a no-op.
 *
 * Note: stray commit-log text that had been pasted into this function body
 * (a git-blame export artifact) has been removed; the code is otherwise
 * unchanged. */
static void
xlate_xbundle_remove(struct xlate_cfg *xcfg, struct xbundle *xbundle)
{
    struct xport *xport;

    if (!xbundle) {
        return;
    }

    /* Member ports survive the bundle; just unlink them. */
    LIST_FOR_EACH_POP (xport, bundle_node, &xbundle->xports) {
        xport->xbundle = NULL;
    }

    hmap_remove(&xcfg->xbundles, &xbundle->hmap_node);
    ovs_list_remove(&xbundle->list_node);
    bond_unref(xbundle->bond);
    lacp_unref(xbundle->lacp);
    /* 'cvlans' is a private copy owned by the xbundle (it must not alias the
     * ofbundle's bitmap, see commit fixing the use-after-free in
     * xlate_actions()), so it is freed here. */
    free(xbundle->cvlans);
    free(xbundle->name);
    free(xbundle);
}
/* Schedules removal of the xbundle that corresponds to 'ofbundle' from the
 * new xlate configuration.  Requires an xlate transaction to be in progress
 * ('new_xcfg' non-NULL). */
void
xlate_bundle_remove(struct ofbundle *ofbundle)
{
    ovs_assert(new_xcfg);

    /* xlate_xbundle_remove() handles a NULL lookup result itself. */
    xlate_xbundle_remove(new_xcfg, xbundle_lookup(new_xcfg, ofbundle));
}
/* Creates (if necessary) and configures the xport for 'ofport' in the new
 * xlate configuration: refreshes its basic attributes, re-links it to its
 * peer and bundle, and rebuilds its QoS queue-to-DSCP map from 'qdscp_list'.
 * Requires 'new_xcfg' to be non-NULL, i.e. an xlate transaction must be in
 * progress. */
void
xlate_ofport_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle,
                 struct ofport_dpif *ofport, ofp_port_t ofp_port,
                 odp_port_t odp_port, const struct netdev *netdev,
                 const struct cfm *cfm, const struct bfd *bfd,
                 const struct lldp *lldp, struct ofport_dpif *peer,
                 int stp_port_no, const struct rstp_port *rstp_port,
                 const struct ofproto_port_queue *qdscp_list, size_t n_qdscp,
                 enum ofputil_port_config config,
                 enum ofputil_port_state state, bool is_tunnel,
                 bool may_enable)
{
    size_t i;
    struct xport *xport;

    ovs_assert(new_xcfg);

    xport = xport_lookup(new_xcfg, ofport);
    if (!xport) {
        /* First time this ofport is seen: allocate its translation-side
         * counterpart and register it. */
        xport = xzalloc(sizeof *xport);
        xport->ofport = ofport;
        xport->xbridge = xbridge_lookup(new_xcfg, ofproto);
        xport->ofp_port = ofp_port;
        uuid_generate(&xport->uuid);

        xlate_xport_init(new_xcfg, xport);
    }

    /* The OpenFlow port number of an existing xport must never change. */
    ovs_assert(xport->ofp_port == ofp_port);

    xlate_xport_set(xport, odp_port, netdev, cfm, bfd, lldp,
                    stp_port_no, rstp_port, config, state, is_tunnel,
                    may_enable);

    /* Re-establish the symmetric peer link: first break the old one (both
     * directions), then set up the new one, if any. */
    if (xport->peer) {
        xport->peer->peer = NULL;
    }
    xport->peer = xport_lookup(new_xcfg, peer);
    if (xport->peer) {
        xport->peer->peer = xport;
    }

    /* Move the xport to its (possibly new) bundle. */
    if (xport->xbundle) {
        ovs_list_remove(&xport->bundle_node);
    }
    xport->xbundle = xbundle_lookup(new_xcfg, ofbundle);
    if (xport->xbundle) {
        ovs_list_insert(&xport->xbundle->xports, &xport->bundle_node);
    }

    /* Rebuild the skb_priority -> DSCP map from scratch.  Queues that cannot
     * be mapped to a priority by the datapath are skipped. */
    clear_skb_priorities(xport);
    for (i = 0; i < n_qdscp; i++) {
        struct skb_priority_to_dscp *pdscp;
        uint32_t skb_priority;

        if (dpif_queue_to_priority(xport->xbridge->dpif, qdscp_list[i].queue,
                                   &skb_priority)) {
            continue;
        }

        pdscp = xmalloc(sizeof *pdscp);
        pdscp->skb_priority = skb_priority;
        /* Shift into the DSCP field position within the IP TOS byte. */
        pdscp->dscp = (qdscp_list[i].dscp << 2) & IP_DSCP_MASK;
        hmap_insert(&xport->skb_priorities, &pdscp->hmap_node,
                    hash_int(pdscp->skb_priority, 0));
    }
}
/* Detaches 'xport' from 'xcfg', from its peer, and from its bundle, releases
 * the resources it references, and frees it.  A NULL 'xport' is a no-op. */
static void
xlate_xport_remove(struct xlate_cfg *xcfg, struct xport *xport)
{
    if (!xport) {
        return;
    }

    /* Break the symmetric peer link in both directions, if present. */
    if (xport->peer) {
        xport->peer->peer = NULL;
        xport->peer = NULL;
    }

    if (xport->xbundle) {
        ovs_list_remove(&xport->bundle_node);
    }

    /* Empty the priority->DSCP map before destroying it. */
    clear_skb_priorities(xport);
    hmap_destroy(&xport->skb_priorities);

    /* The xport is indexed three ways: by ofport pointer, by UUID, and by
     * OpenFlow port number within its bridge. */
    hmap_remove(&xcfg->xports, &xport->hmap_node);
    hmap_remove(&xcfg->xports_uuid, &xport->uuid_node);
    hmap_remove(&xport->xbridge->xports, &xport->ofp_node);

    netdev_close(xport->netdev);
    rstp_port_unref(xport->rstp_port);
    cfm_unref(xport->cfm);
    bfd_unref(xport->bfd);
    lldp_unref(xport->lldp);
    free(xport);
}
/* Schedules removal of the xport that corresponds to 'ofport' from the new
 * xlate configuration, flushing any tunnel neighbor entries learned through
 * the port's netdev first.  Requires an xlate transaction to be in progress
 * ('new_xcfg' non-NULL). */
void
xlate_ofport_remove(struct ofport_dpif *ofport)
{
    struct xport *xport;

    ovs_assert(new_xcfg);

    xport = xport_lookup(new_xcfg, ofport);
    if (xport) {
        /* Drop tunnel neighbor cache entries tied to this device. */
        tnl_neigh_flush(netdev_get_name(xport->netdev));
    }

    xlate_xport_remove(new_xcfg, xport);
}
/* Maps datapath metadata ('backer' plus 'flow') to the ofproto that should
 * translate the packet, also returning the ingress xport in '*xportp' (NULL
 * when the packet came from the controller and has no real ingress port) and
 * optionally the OpenFlow input port in '*ofp_in_port'.
 *
 * On failure returns NULL and, if 'errorp' is nonnull, stores a malloc'd
 * error message in '*errorp' that the caller must free.  On success
 * '*errorp' (if nonnull) is set to NULL. */
static struct ofproto_dpif *
xlate_lookup_ofproto_(const struct dpif_backer *backer,
                      const struct flow *flow,
                      ofp_port_t *ofp_in_port, const struct xport **xportp,
                      char **errorp)
{
    struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
    const struct xport *xport;

    /* If packet is recirculated, xport can be retrieved from frozen state. */
    if (flow->recirc_id) {
        const struct recirc_id_node *recirc_id_node;

        recirc_id_node = recirc_id_node_find(flow->recirc_id);

        if (OVS_UNLIKELY(!recirc_id_node)) {
            if (errorp) {
                *errorp = xasprintf("no recirculation data for recirc_id "
                                    "%#"PRIx32, flow->recirc_id);
            }
            return NULL;
        }

        ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
        if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
            /* A real ingress port: recover the xport by the UUID saved in
             * the frozen state. */
            struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
            xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
            if (xport && xport->xbridge && xport->xbridge->ofproto) {
                goto out;
            }
        } else {
            /* OFPP_NONE and OFPP_CONTROLLER are not real ports. They indicate
             * that the packet originated from the controller via an OpenFlow
             * "packet-out". The right thing to do is to find just the
             * ofproto. There is no xport, which is OK.
             *
             * OFPP_NONE can also indicate that a bond caused recirculation. */
            struct uuid uuid = recirc_id_node->state.ofproto_uuid;
            const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, &uuid);
            if (bridge && bridge->ofproto) {
                if (errorp) {
                    *errorp = NULL;
                }
                *xportp = NULL;
                if (ofp_in_port) {
                    *ofp_in_port = in_port;
                }
                return bridge->ofproto;
            }
        }
    }

    /* Not recirculated (or frozen-state lookup failed): resolve the xport
     * from the datapath port, treating tunnel packets specially. */
    xport = xport_lookup(xcfg, tnl_port_should_receive(flow)
                         ? tnl_port_receive(flow)
                         : odp_port_to_ofport(backer, flow->in_port.odp_port));
    if (OVS_UNLIKELY(!xport)) {
        if (errorp) {
            *errorp = (tnl_port_should_receive(flow)
                       ? xstrdup("no OpenFlow tunnel port for this packet")
                       : xasprintf("no OpenFlow port for datapath port "
                                   "%"PRIu32, flow->in_port.odp_port));
        }
        return NULL;
    }

out:
    if (errorp) {
        *errorp = NULL;
    }
    *xportp = xport;
    if (ofp_in_port) {
        *ofp_in_port = xport->ofp_port;
    }
    return xport->xbridge->ofproto;
}
/* Resolves datapath metadata ('backer' and 'flow') to the corresponding
 * ofproto_dpif, optionally storing the OpenFlow input port in
 * '*ofp_in_port'.  Returns NULL on failure, in which case '*errorp' (if
 * nonnull) receives a malloc'd error string. */
struct ofproto_dpif *
xlate_lookup_ofproto(const struct dpif_backer *backer, const struct flow *flow,
                     ofp_port_t *ofp_in_port, char **errorp)
{
    const struct xport *xport_unused;

    /* The caller does not need the xport, so it is discarded. */
    return xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport_unused,
                                 errorp);
}
/* Resolves datapath metadata ('backer' and 'flow') and, for each nonnull
 * output pointer, fills in the corresponding result: '*ofprotop' with the
 * ofproto_dpif, '*ofp_in_port' with the OpenFlow input port, and '*ipfix',
 * '*sflow' and '*netflow' with the bridge's protocol handles (NULL when no
 * ingress xport was found).  The returned pointers may be used until
 * quiescing; longer-term use requires taking additional references.
 *
 * Returns 0 on success, ENODEV if the flow has no associated ofproto, in
 * which case an extended error string is stored in '*errorp' that the caller
 * must free. */
int
xlate_lookup(const struct dpif_backer *backer, const struct flow *flow,
             struct ofproto_dpif **ofprotop, struct dpif_ipfix **ipfix,
             struct dpif_sflow **sflow, struct netflow **netflow,
             ofp_port_t *ofp_in_port, char **errorp)
{
    const struct xport *xport;
    const struct xbridge *xbridge;
    struct ofproto_dpif *ofproto;

    ofproto = xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport, errorp);
    if (!ofproto) {
        return ENODEV;
    }

    /* 'xport' may legitimately be NULL (controller packet-out); in that case
     * every per-bridge handle below is reported as NULL. */
    xbridge = xport ? xport->xbridge : NULL;

    if (ofprotop) {
        *ofprotop = ofproto;
    }
    if (ipfix) {
        *ipfix = xbridge ? xbridge->ipfix : NULL;
    }
    if (sflow) {
        *sflow = xbridge ? xbridge->sflow : NULL;
    }
    if (netflow) {
        *netflow = xbridge ? xbridge->netflow : NULL;
    }

    return 0;
}
static struct xbridge *
xbridge_lookup(struct xlate_cfg *xcfg, const struct ofproto_dpif *ofproto)
{
struct hmap *xbridges;
struct xbridge *xbridge;
if (!ofproto || !xcfg) {
return NULL;
}
xbridges = &xcfg->xbridges;
ofproto: Use xlate map for uuid lookups. The ofproto map 'all_ofproto_dpifs_by_uuid' does not support concurrent accesses. It is however read by upcall handler threads and written by the main thread at the same time. Additionally, handler threads will change the ams_seq while an ofproto is being destroyed, triggering crashes with the following backtrace: (gdb) bt hmap_next (hmap.h:398) seq_wake_waiters (seq.c:326) seq_change_protected (seq.c:134) seq_change (seq.c:144) ofproto_dpif_send_async_msg (ofproto_dpif.c:263) process_upcall (ofproto_dpif_upcall.c:1782) recv_upcalls (ofproto_dpif_upcall.c:1026) udpif_upcall_handler (ofproto/ofproto_dpif_upcall.c:945) ovsthread_wrapper (ovs_thread.c:734) To solve both issues, remove the 'all_ofproto_dpifs_by_uuid'. Instead, another map already storing ofprotos in xlate can be used. During an ofproto destruction, its reference is removed from the current xlate xcfg. Such change is committed only after all threads have quiesced at least once during xlate_txn_commit(). This wait ensures that the removal is seen by all threads, rendering impossible for a thread to still hold a reference while the destruction proceeds. Furthermore, the xlate maps are copied during updates instead of being written in place. It is thus correct to read xcfg->xbridges while inserting or removing from new_xcfg->xbridges. Finally, now that ofproto_dpifs lookups are done through xcfg->xbridges, it is important to use a high level of entropy. As it used the ofproto pointer hashed, fewer bits were random compared to the uuid key used in 'all_ofproto_dpifs_by_uuid'. To solve this, use the ofproto uuid as the key in xbridges as well, improving entropy. 
Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user action cookie.") Suggested-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Alin-Gabriel Serdean <aserdean@ovn.org> Tested-by: Alin-Gabriel Serdean <aserdean@ovn.org> Signed-off-by: Gaetan Rivet <grive@u256.net> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com> Co-authored-by: Yunjian Wang <wangyunjian@huawei.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-02-23 19:48:12 +01:00
HMAP_FOR_EACH_IN_BUCKET (xbridge, hmap_node, uuid_hash(&ofproto->uuid),
xbridges) {
if (xbridge->ofproto == ofproto) {
return xbridge;
}
}
return NULL;
}
/* Returns the xbridge in 'xcfg' whose ofproto has 'uuid', or NULL if there
 * is none.
 *
 * 'xcfg->xbridges' is hashed by the ofproto UUID (see xbridge_lookup()), so
 * the search is restricted to the matching hash bucket instead of scanning
 * the whole map.  Stray commit-log text embedded in the original body (a
 * git-blame export artifact) has been removed. */
static struct xbridge *
xbridge_lookup_by_uuid(struct xlate_cfg *xcfg, const struct uuid *uuid)
{
    struct xbridge *xbridge;

    HMAP_FOR_EACH_IN_BUCKET (xbridge, hmap_node, uuid_hash(uuid),
                             &xcfg->xbridges) {
        if (uuid_equals(&xbridge->ofproto->uuid, uuid)) {
            return xbridge;
        }
    }
    return NULL;
}
ofproto: Use xlate map for uuid lookups. The ofproto map 'all_ofproto_dpifs_by_uuid' does not support concurrent accesses. It is however read by upcall handler threads and written by the main thread at the same time. Additionally, handler threads will change the ams_seq while an ofproto is being destroyed, triggering crashes with the following backtrace: (gdb) bt hmap_next (hmap.h:398) seq_wake_waiters (seq.c:326) seq_change_protected (seq.c:134) seq_change (seq.c:144) ofproto_dpif_send_async_msg (ofproto_dpif.c:263) process_upcall (ofproto_dpif_upcall.c:1782) recv_upcalls (ofproto_dpif_upcall.c:1026) udpif_upcall_handler (ofproto/ofproto_dpif_upcall.c:945) ovsthread_wrapper (ovs_thread.c:734) To solve both issues, remove the 'all_ofproto_dpifs_by_uuid'. Instead, another map already storing ofprotos in xlate can be used. During an ofproto destruction, its reference is removed from the current xlate xcfg. Such change is committed only after all threads have quiesced at least once during xlate_txn_commit(). This wait ensures that the removal is seen by all threads, rendering impossible for a thread to still hold a reference while the destruction proceeds. Furthermore, the xlate maps are copied during updates instead of being written in place. It is thus correct to read xcfg->xbridges while inserting or removing from new_xcfg->xbridges. Finally, now that ofproto_dpifs lookups are done through xcfg->xbridges, it is important to use a high level of entropy. As it used the ofproto pointer hashed, fewer bits were random compared to the uuid key used in 'all_ofproto_dpifs_by_uuid'. To solve this, use the ofproto uuid as the key in xbridges as well, improving entropy. 
Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user action cookie.") Suggested-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Alin-Gabriel Serdean <aserdean@ovn.org> Tested-by: Alin-Gabriel Serdean <aserdean@ovn.org> Signed-off-by: Gaetan Rivet <grive@u256.net> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com> Co-authored-by: Yunjian Wang <wangyunjian@huawei.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-02-23 19:48:12 +01:00
struct ofproto_dpif *
xlate_ofproto_lookup(const struct uuid *uuid)
{
struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
struct xbridge *xbridge;
if (!xcfg) {
return NULL;
}
xbridge = xbridge_lookup_by_uuid(xcfg, uuid);
if (xbridge != NULL) {
return xbridge->ofproto;
}
return NULL;
}
/* Returns the xbundle for 'ofbundle' in 'xcfg', or NULL if there is none.
 * Tolerates NULL arguments. */
static struct xbundle *
xbundle_lookup(struct xlate_cfg *xcfg, const struct ofbundle *ofbundle)
{
    struct xbundle *xbundle;

    if (!xcfg || !ofbundle) {
        return NULL;
    }

    /* The map is hashed by the ofbundle pointer itself. */
    HMAP_FOR_EACH_IN_BUCKET (xbundle, hmap_node, hash_pointer(ofbundle, 0),
                             &xcfg->xbundles) {
        if (xbundle->ofbundle == ofbundle) {
            return xbundle;
        }
    }

    return NULL;
}
/* Returns the xport for 'ofport' in 'xcfg', or NULL if there is none.
 * Tolerates NULL arguments. */
static struct xport *
xport_lookup(struct xlate_cfg *xcfg, const struct ofport_dpif *ofport)
{
    struct xport *xport;

    if (!xcfg || !ofport) {
        return NULL;
    }

    /* The map is hashed by the ofport pointer itself. */
    HMAP_FOR_EACH_IN_BUCKET (xport, hmap_node, hash_pointer(ofport, 0),
                             &xcfg->xports) {
        if (xport->ofport == ofport) {
            return xport;
        }
    }

    return NULL;
}
/* Returns the xport in 'xcfg' with the given 'uuid', or NULL if there is
 * none.  An all-zeros UUID never matches. */
static struct xport *
xport_lookup_by_uuid(struct xlate_cfg *xcfg, const struct uuid *uuid)
{
    struct xport *xport;

    if (!xcfg || uuid_is_zero(uuid)) {
        return NULL;
    }

    HMAP_FOR_EACH_IN_BUCKET (xport, uuid_node, uuid_hash(uuid),
                             &xcfg->xports_uuid) {
        if (uuid_equals(&xport->uuid, uuid)) {
            return xport;
        }
    }

    return NULL;
}
/* Returns the STP port for 'xport', or NULL if STP is not enabled on the
 * bridge or the port has no STP port number assigned. */
static struct stp_port *
xport_get_stp_port(const struct xport *xport)
{
    if (!xport->xbridge->stp || xport->stp_port_no == -1) {
        return NULL;
    }

    return stp_get_port(xport->xbridge->stp, xport->stp_port_no);
}
/* Returns true if MAC learning is allowed on 'xport' according to its STP
 * state.  Ports not under STP control may always learn. */
static bool
xport_stp_learn_state(const struct xport *xport)
{
    struct stp_port *sp = xport_get_stp_port(xport);

    if (!sp) {
        return true;
    }

    return stp_learn_in_state(stp_port_get_state(sp));
}
/* Returns true if forwarding is allowed on 'xport' according to its STP
 * state.  Ports not under STP control may always forward. */
static bool
xport_stp_forward_state(const struct xport *xport)
{
    struct stp_port *sp = xport_get_stp_port(xport);

    if (!sp) {
        return true;
    }

    return stp_forward_in_state(stp_port_get_state(sp));
}
/* Returns true if BPDUs should be forwarded on 'xport'; ports without an
 * STP port are evaluated as if in the STP_DISABLED state. */
static bool
xport_stp_should_forward_bpdu(const struct xport *xport)
{
    struct stp_port *sp = xport_get_stp_port(xport);

    return stp_should_forward_bpdu(sp
                                   ? stp_port_get_state(sp)
                                   : STP_DISABLED);
}
/* Returns true if STP should process 'flow'.  Records in 'wc' the fields
 * that were consulted to make the determination. */
static bool
stp_should_process_flow(const struct flow *flow, struct flow_wildcards *wc)
{
    /* The decision depends on the destination MAC, so unwildcard it.
     * is_stp() also checks dl_type, but dl_type is always set in 'wc'. */
    memset(&wc->masks.dl_dst, 0xff, sizeof wc->masks.dl_dst);

    return is_stp(flow);
}
/* Hands the BPDU contained in 'packet', received on 'xport', to the STP
 * module.  Operates on a shallow copy of 'packet', so the caller's packet
 * is not modified. */
static void
stp_process_packet(const struct xport *xport, const struct dp_packet *packet)
{
    struct stp_port *sp = xport_get_stp_port(xport);
    struct dp_packet payload = *packet;
    struct eth_header *eth = dp_packet_data(&payload);

    /* Sink packets on ports that have STP disabled when the bridge has
     * STP enabled. */
    if (!sp || stp_port_get_state(sp) == STP_DISABLED) {
        return;
    }

    /* Trim off padding on payload. */
    if (dp_packet_size(&payload) > ntohs(eth->eth_type) + ETH_HEADER_LEN) {
        dp_packet_set_size(&payload, ntohs(eth->eth_type) + ETH_HEADER_LEN);
    }

    /* Strip the Ethernet and LLC headers and deliver the raw BPDU; if the
     * packet is too short to contain them, drop it. */
    if (dp_packet_try_pull(&payload, ETH_HEADER_LEN + LLC_HEADER_LEN)) {
        stp_received_bpdu(sp, dp_packet_data(&payload), dp_packet_size(&payload));
    }
}
/* Returns the RSTP state of 'xport', or RSTP_DISABLED if the port is not
 * under RSTP control. */
static enum rstp_state
xport_get_rstp_port_state(const struct xport *xport)
{
    if (!xport->rstp_port) {
        return RSTP_DISABLED;
    }

    return rstp_port_get_state(xport->rstp_port);
}
/* Returns true if MAC learning is allowed on 'xport' according to its RSTP
 * state.  Ports not under RSTP control may always learn. */
static bool
xport_rstp_learn_state(const struct xport *xport)
{
    if (!xport->xbridge->rstp || !xport->rstp_port) {
        return true;
    }

    return rstp_learn_in_state(xport_get_rstp_port_state(xport));
}
/* Returns true if forwarding is allowed on 'xport' according to its RSTP
 * state.  Ports not under RSTP control may always forward. */
static bool
xport_rstp_forward_state(const struct xport *xport)
{
    if (!xport->xbridge->rstp || !xport->rstp_port) {
        return true;
    }

    return rstp_forward_in_state(xport_get_rstp_port_state(xport));
}
/* Returns true if the RSTP module should handle BPDUs received on 'xport',
 * based on the port's current RSTP state. */
static bool
xport_rstp_should_manage_bpdu(const struct xport *xport)
{
    return rstp_should_manage_bpdu(xport_get_rstp_port_state(xport));
}
/* Hands the BPDU contained in 'packet', received on 'xport', to the RSTP
 * module.  Operates on a shallow copy of 'packet', so the caller's packet
 * is not modified. */
static void
rstp_process_packet(const struct xport *xport, const struct dp_packet *packet)
{
    struct dp_packet payload = *packet;
    struct eth_header *eth = dp_packet_data(&payload);

    /* Sink packets on ports that have no RSTP. */
    if (!xport->rstp_port) {
        return;
    }

    /* Trim off padding on payload. */
    if (dp_packet_size(&payload) > ntohs(eth->eth_type) + ETH_HEADER_LEN) {
        dp_packet_set_size(&payload, ntohs(eth->eth_type) + ETH_HEADER_LEN);
    }

    /* Strip the Ethernet and LLC headers, plus a VLAN tag if present, and
     * deliver the raw BPDU; drop packets too short to contain them. */
    int len = ETH_HEADER_LEN + LLC_HEADER_LEN;
    if (eth->eth_type == htons(ETH_TYPE_VLAN)) {
        len += VLAN_HEADER_LEN;
    }

    if (dp_packet_try_pull(&payload, len)) {
        rstp_port_received_bpdu(xport->rstp_port, dp_packet_data(&payload),
                                dp_packet_size(&payload));
    }
}
/* Returns the xport on 'xbridge' with OpenFlow port number 'ofp_port', or
 * NULL if there is none. */
static struct xport *
get_ofp_port(const struct xbridge *xbridge, ofp_port_t ofp_port)
{
    struct xport *xport;

    HMAP_FOR_EACH_IN_BUCKET (xport, ofp_node, hash_ofp_port(ofp_port),
                             &xbridge->xports) {
        if (xport->ofp_port == ofp_port) {
            return xport;
        }
    }

    return NULL;
}
/* Translates OpenFlow port 'ofp_port' on 'xbridge' to its datapath port
 * number, or ODPP_NONE if the port is unknown. */
static odp_port_t
ofp_port_to_odp_port(const struct xbridge *xbridge, ofp_port_t ofp_port)
{
    const struct xport *xport = get_ofp_port(xbridge, ofp_port);

    if (!xport) {
        return ODPP_NONE;
    }

    return xport->odp_port;
}
/* Returns true if OpenFlow port 'ofp_port' exists on the current bridge and
 * is enabled ('may_enable'). */
static bool
odp_port_is_alive(const struct xlate_ctx *ctx, ofp_port_t ofp_port)
{
    const struct xport *xport = get_ofp_port(ctx->xbridge, ofp_port);

    return xport ? xport->may_enable : false;
}
static struct ofputil_bucket *
group_first_live_bucket(const struct xlate_ctx *, const struct group_dpif *,
int depth);
/* Returns true if the group with 'group_id' exists in the current tables
 * version and has at least one live bucket.  'depth' tracks chained group
 * liveness recursion. */
static bool
group_is_alive(const struct xlate_ctx *ctx, uint32_t group_id, int depth)
{
    struct group_dpif *group = group_dpif_lookup(ctx->xbridge->ofproto,
                                                 group_id,
                                                 ctx->xin->tables_version,
                                                 false);

    return group && group_first_live_bucket(ctx, group, depth) != NULL;
}
#define MAX_LIVENESS_RECURSION 128 /* Arbitrary limit */
/* Returns true if 'bucket' of 'group' counts as live and may therefore be
 * used for forwarding.  'depth' counts the number of chained group liveness
 * checks so that watch-group cycles cannot recurse forever. */
static bool
bucket_is_alive(const struct xlate_ctx *ctx, const struct group_dpif *group,
                const struct ofputil_bucket *bucket, int depth)
{
    if (depth >= MAX_LIVENESS_RECURSION) {
        xlate_report_error(ctx, "bucket chaining exceeded %d links",
                           MAX_LIVENESS_RECURSION);
        return false;
    }

    /* In "select" groups, buckets with weight 0 are not used.
     * In other kinds of groups, weight does not matter. */
    if (group->up.type == OFPGT11_SELECT && bucket->weight == 0) {
        return false;
    }

    /* A bucket is live if it has no liveness requirement at all, or if its
     * watched port is up, or if its watched group is itself alive, or if it
     * watches the controller and a controller connection is alive. */
    return (!ofputil_bucket_has_liveness(bucket)
            || (bucket->watch_port != OFPP_ANY
                && bucket->watch_port != OFPP_CONTROLLER
                && odp_port_is_alive(ctx, bucket->watch_port))
            || (bucket->watch_group != OFPG_ANY
                && group_is_alive(ctx, bucket->watch_group, depth + 1))
            || (bucket->watch_port == OFPP_CONTROLLER
                && ofproto_is_alive(&ctx->xbridge->ofproto->up)));
}
/* Adds a trace entry explaining which watch entity caused 'bucket' to be
 * considered not live.  Does nothing unless tracing is enabled. */
static void
xlate_report_bucket_not_live(const struct xlate_ctx *ctx,
                             const struct ofputil_bucket *bucket)
{
    if (OVS_UNLIKELY(ctx->xin->trace)) {
        struct ds s = DS_EMPTY_INITIALIZER;
        if (bucket->watch_port != OFPP_ANY) {
            ds_put_cstr(&s, "port ");
            ofputil_format_port(bucket->watch_port, NULL, &s);
        }
        if (bucket->watch_group != OFPG_ANY) {
            if (s.length) {
                ds_put_cstr(&s, " and ");
            }
            /* Fix: 'watch_group' is a group ID, so label it "group", not
             * "port" as the previous code did. */
            ds_put_format(&s, "group %"PRIu32, bucket->watch_group);
        }

        xlate_report(ctx, OFT_DETAIL, "bucket %"PRIu32": not live due to %s",
                     bucket->bucket_id, ds_cstr(&s));

        ds_destroy(&s);
    }
}
static struct ofputil_bucket *
group_first_live_bucket(const struct xlate_ctx *ctx,
const struct group_dpif *group, int depth)
{
struct ofputil_bucket *bucket;
ofproto-dpif: Unhide structure contents. Until now, ofproto-dpif.c has hidden the definitions of several structures, such as struct ofproto_dpif and struct rule_dpif. This kind of information hiding is often beneficial, because it forces code outside the file with the definition to use the documented interfaces. However, in this case it was starting to burden ofproto-dpif with an increasing number of trivial helpers that were not improving or maintaining a useful abstraction and that were making code harder to maintain and read. Information hiding also made it hard to move blocks of code outside ofproto-dpif.c itself, since any code moved out often needed new helpers if it used anything that wasn't previously exposed. In the present instance, upcoming patches will move code for tracing outside ofproto-dpif, and this would require adding several helpers that would just obscure the function of the code otherwise needlessly. In balance, it seems that there is more harm than good in the information hiding here, so this commit moves the definitions of several structures from ofproto-dpif.c into ofproto-dpif.h. It also removes all of the trivial helpers that had accumulated, instead changing their users to directly access the members that they needed. It also reorganizes ofproto-dpif.h, grouping structure definitions and function prototypes in a sensible way. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Lance Richardson <lrichard@redhat.com> Acked-by: Justin Pettit <jpettit@ovn.org>
2016-12-06 14:08:42 -08:00
LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
if (bucket_is_alive(ctx, group, bucket, depth)) {
return bucket;
}
xlate_report_bucket_not_live(ctx, bucket);
}
return NULL;
}
/* Returns the live bucket in 'group' with the highest selection score, where
 * each bucket's score is the bucket ID hashed with 'basis' (masked to 16
 * bits) times the bucket's weight.  Later buckets win ties ('>=').  Returns
 * NULL if no bucket is live.
 *
 * Note: stray commit-log text that had been pasted into this function body
 * (a git-blame export artifact) has been removed; the code is otherwise
 * unchanged. */
static struct ofputil_bucket *
group_best_live_bucket(const struct xlate_ctx *ctx,
                       const struct group_dpif *group,
                       uint32_t basis)
{
    struct ofputil_bucket *best_bucket = NULL;
    uint32_t best_score = 0;

    struct ofputil_bucket *bucket;

    LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
        if (bucket_is_alive(ctx, group, bucket, 0)) {
            uint32_t score =
                (hash_int(bucket->bucket_id, basis) & 0xffff) * bucket->weight;
            if (score >= best_score) {
                best_bucket = bucket;
                best_score = score;
            }
            xlate_report(ctx, OFT_DETAIL, "bucket %"PRIu32": score %"PRIu32,
                         bucket->bucket_id, score);
        } else {
            xlate_report_bucket_not_live(ctx, bucket);
        }
    }

    return best_bucket;
}
/* Returns true if 'bundle' (in a trunking mode) carries 'vlan'.  A NULL
 * trunk bitmap means "all VLANs"; access-mode bundles never trunk. */
static bool
xbundle_trunks_vlan(const struct xbundle *bundle, uint16_t vlan)
{
    if (bundle->vlan_mode == PORT_VLAN_ACCESS) {
        return false;
    }

    return !bundle->trunks || bitmap_is_set(bundle->trunks, vlan);
}
/* Returns true if customer VLAN 'vlan' is permitted on 'bundle'.  A NULL
 * cvlans bitmap means "all customer VLANs allowed". */
static bool
xbundle_allows_cvlan(const struct xbundle *bundle, uint16_t vlan)
{
    return bundle->cvlans ? bitmap_is_set(bundle->cvlans, vlan) : true;
}
/* Returns true if the (possibly double-tagged) VLAN information in 'xvlan'
 * is acceptable on 'xbundle' given its configured VLAN mode. */
static bool
xbundle_includes_vlan(const struct xbundle *xbundle, const struct xvlan *xvlan)
{
    switch (xbundle->vlan_mode) {
    case PORT_VLAN_ACCESS:
        /* Access ports carry exactly the configured VLAN, with no inner
         * tag. */
        return xvlan->v[0].vid == xbundle->vlan && xvlan->v[1].vid == 0;

    case PORT_VLAN_TRUNK:
    case PORT_VLAN_NATIVE_UNTAGGED:
    case PORT_VLAN_NATIVE_TAGGED:
        return xbundle_trunks_vlan(xbundle, xvlan->v[0].vid);

    case PORT_VLAN_DOT1Q_TUNNEL:
        /* dot1q-tunnel: the outer tag must match the tunnel VLAN and the
         * inner (customer) VLAN must be allowed. */
        return xvlan->v[0].vid == xbundle->vlan &&
               xbundle_allows_cvlan(xbundle, xvlan->v[1].vid);

    default:
        OVS_NOT_REACHED();
    }
}
/* Returns the mask of mirrors whose output destination is 'xbundle', or 0
 * for the special "none" bundle. */
static mirror_mask_t
xbundle_mirror_out(const struct xbridge *xbridge, struct xbundle *xbundle)
{
    if (xbundle == &ofpp_none_bundle) {
        return 0;
    }

    return mirror_bundle_out(xbridge->mbridge, xbundle->ofbundle);
}
/* Returns the mask of mirrors that select packets ingressing on 'xbundle',
 * or 0 for the special "none" bundle. */
static mirror_mask_t
xbundle_mirror_src(const struct xbridge *xbridge, struct xbundle *xbundle)
{
    if (xbundle == &ofpp_none_bundle) {
        return 0;
    }

    return mirror_bundle_src(xbridge->mbridge, xbundle->ofbundle);
}
/* Returns the mask of mirrors that select packets egressing on 'xbundle',
 * or 0 for the special "none" bundle. */
static mirror_mask_t
xbundle_mirror_dst(const struct xbridge *xbridge, struct xbundle *xbundle)
{
    if (xbundle == &ofpp_none_bundle) {
        return 0;
    }

    return mirror_bundle_dst(xbridge->mbridge, xbundle->ofbundle);
}
/* Returns the bundle on 'xbridge' on which a packet with OpenFlow input
 * port 'in_port' arrived, also storing the xport (possibly NULL) in
 * '*in_xportp' if 'in_xportp' is nonnull.  Returns NULL if the port is
 * unknown or not part of a bundle, except that controller-sourced traffic
 * (OFPP_NONE, OFPP_CONTROLLER) maps to the special "none" bundle. */
static struct xbundle *
lookup_input_bundle__(const struct xbridge *xbridge,
                      ofp_port_t in_port, struct xport **in_xportp)
{
    struct xport *xport = get_ofp_port(xbridge, in_port);

    if (in_xportp) {
        *in_xportp = xport;
    }

    if (xport && xport->xbundle) {
        return xport->xbundle;
    }

    /* OFPP_NONE (OF1.0) and OFPP_CONTROLLER (OF1.1+) are the ingress ports a
     * controller may use for traffic that it sources itself. */
    if (in_port == OFPP_CONTROLLER || in_port == OFPP_NONE) {
        return &ofpp_none_bundle;
    }

    return NULL;
}
/* Like lookup_input_bundle__(), but additionally logs a translation error
 * when no bundle can be found for 'in_port'. */
static struct xbundle *
lookup_input_bundle(const struct xlate_ctx *ctx,
                    ofp_port_t in_port, struct xport **in_xportp)
{
    struct xbundle *xbundle = lookup_input_bundle__(ctx->xbridge,
                                                    in_port, in_xportp);
    if (!xbundle) {
        /* Odd. A few possible reasons here:
         *
         * - We deleted a port but there are still a few packets queued up
         *   from it.
         *
         * - Someone externally added a port (e.g. "ovs-dpctl add-if") that
         *   we don't know about.
         *
         * - The ofproto client didn't configure the port as part of a bundle.
         *   This is particularly likely to happen if a packet was received on
         *   the port after it was created, but before the client had a chance
         *   to configure its bundle.
         */
        xlate_report_error(ctx, "received packet on unknown port %"PRIu32,
                           in_port);
    }
    return xbundle;
}
ofproto-dpif-xlate: Don't consider mirrors used when excluded by VLAN. Mirrors can be configured to select packets for mirroring on the basis of multiple criteria: input ports, output ports, and VLANs. A packet P is to be mirrored if there exists a mirror M such that either: - P ingresses on an input port selected by M, or - P egresses on an output port selected by M AND P is in a VLAN selected by M. In addition, every mirror has a destination, which can be an output port or an output VLAN. Either way, if a packet is mirrored to a particular destination, it is done only once, even if different mirrors both select a packet and have the same destination. Since commit efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.), these requirements have been implemented incorrectly: if a packet satisfies one of the bulleted requirements above for mirror M1, but not the VLAN selection requirement for M1, then it was not sent to M's destination, but it was still considered as having been sent to M1's destination for the purpose of avoid output duplication. Thus, if P satisfied *all* of the requirements for a second mirror M2, if M1 and M2 had the same destination, the packet was still not mirrored. This commit fixes that problem. (The issue only occurred if M1 happened to have a smaller index than M2 in OVS's internal data structures. That's just a matter of luck.) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: http://openvswitch.org/pipermail/dev/2016-January/064531.html Fixes: 7efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.) Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-05 19:16:01 -08:00
/* Mirrors the packet represented by 'ctx' to appropriate mirror destinations,
* given the packet is ingressing or egressing on 'xbundle', which has ingress
* or egress (as appropriate) mirrors 'mirrors'. */
static void
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle,
mirror_mask_t mirrors)
{
struct xvlan in_xvlan;
struct xvlan xvlan;
ofproto-dpif-xlate: Don't consider mirrors used when excluded by VLAN. Mirrors can be configured to select packets for mirroring on the basis of multiple criteria: input ports, output ports, and VLANs. A packet P is to be mirrored if there exists a mirror M such that either: - P ingresses on an input port selected by M, or - P egresses on an output port selected by M AND P is in a VLAN selected by M. In addition, every mirror has a destination, which can be an output port or an output VLAN. Either way, if a packet is mirrored to a particular destination, it is done only once, even if different mirrors both select a packet and have the same destination. Since commit efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.), these requirements have been implemented incorrectly: if a packet satisfies one of the bulleted requirements above for mirror M1, but not the VLAN selection requirement for M1, then it was not sent to M's destination, but it was still considered as having been sent to M1's destination for the purpose of avoid output duplication. Thus, if P satisfied *all* of the requirements for a second mirror M2, if M1 and M2 had the same destination, the packet was still not mirrored. This commit fixes that problem. (The issue only occurred if M1 happened to have a smaller index than M2 in OVS's internal data structures. That's just a matter of luck.) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: http://openvswitch.org/pipermail/dev/2016-January/064531.html Fixes: 7efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.) Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-05 19:16:01 -08:00
/* Figure out what VLAN the packet is in (because mirrors can select
* packets on basis of VLAN). */
xvlan_extract(&ctx->xin->flow, &in_xvlan);
if (!input_vid_is_valid(ctx, in_xvlan.v[0].vid, xbundle)) {
return;
}
xvlan_input_translate(xbundle, &in_xvlan, &xvlan);
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
const struct xbridge *xbridge = ctx->xbridge;
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
/* Don't mirror to destinations that we've already mirrored to. */
mirrors &= ~ctx->mirrors;
if (!mirrors) {
return;
}
/* 'mirrors' is a bit-mask of candidates for mirroring. Iterate through
* the candidates, adding the ones that really should be mirrored to
* 'used_mirrors', as long as some candidates remain. */
mirror_mask_t used_mirrors = 0;
while (mirrors) {
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
const unsigned long *vlans;
mirror_mask_t dup_mirrors;
struct ofbundle *out;
int out_vlan;
int snaplen;
ofproto-dpif-xlate: Don't consider mirrors used when excluded by VLAN. Mirrors can be configured to select packets for mirroring on the basis of multiple criteria: input ports, output ports, and VLANs. A packet P is to be mirrored if there exists a mirror M such that either: - P ingresses on an input port selected by M, or - P egresses on an output port selected by M AND P is in a VLAN selected by M. In addition, every mirror has a destination, which can be an output port or an output VLAN. Either way, if a packet is mirrored to a particular destination, it is done only once, even if different mirrors both select a packet and have the same destination. Since commit efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.), these requirements have been implemented incorrectly: if a packet satisfies one of the bulleted requirements above for mirror M1, but not the VLAN selection requirement for M1, then it was not sent to M's destination, but it was still considered as having been sent to M1's destination for the purpose of avoid output duplication. Thus, if P satisfied *all* of the requirements for a second mirror M2, if M1 and M2 had the same destination, the packet was still not mirrored. This commit fixes that problem. (The issue only occurred if M1 happened to have a smaller index than M2 in OVS's internal data structures. That's just a matter of luck.) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: http://openvswitch.org/pipermail/dev/2016-January/064531.html Fixes: 7efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.) Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-05 19:16:01 -08:00
/* Get the details of the mirror represented by the rightmost 1-bit. */
if (OVS_UNLIKELY(!mirror_get(xbridge->mbridge, raw_ctz(mirrors),
&vlans, &dup_mirrors,
&out, &snaplen, &out_vlan))) {
/* The mirror got reconfigured before we got to read it's
* configuration. */
mirrors = zero_rightmost_1bit(mirrors);
continue;
}
ofproto-dpif-xlate: Don't consider mirrors used when excluded by VLAN. Mirrors can be configured to select packets for mirroring on the basis of multiple criteria: input ports, output ports, and VLANs. A packet P is to be mirrored if there exists a mirror M such that either: - P ingresses on an input port selected by M, or - P egresses on an output port selected by M AND P is in a VLAN selected by M. In addition, every mirror has a destination, which can be an output port or an output VLAN. Either way, if a packet is mirrored to a particular destination, it is done only once, even if different mirrors both select a packet and have the same destination. Since commit efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.), these requirements have been implemented incorrectly: if a packet satisfies one of the bulleted requirements above for mirror M1, but not the VLAN selection requirement for M1, then it was not sent to M's destination, but it was still considered as having been sent to M1's destination for the purpose of avoid output duplication. Thus, if P satisfied *all* of the requirements for a second mirror M2, if M1 and M2 had the same destination, the packet was still not mirrored. This commit fixes that problem. (The issue only occurred if M1 happened to have a smaller index than M2 in OVS's internal data structures. That's just a matter of luck.) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: http://openvswitch.org/pipermail/dev/2016-January/064531.html Fixes: 7efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.) Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-05 19:16:01 -08:00
/* If this mirror selects on the basis of VLAN, and it does not select
* 'vlan', then discard this mirror and go on to the next one. */
if (vlans) {
ctx->wc->masks.vlans[0].tci |= htons(VLAN_CFI | VLAN_VID_MASK);
}
if (vlans && !bitmap_is_set(vlans, xvlan.v[0].vid)) {
mirrors = zero_rightmost_1bit(mirrors);
continue;
}
/* We sent a packet to this mirror. */
used_mirrors |= rightmost_1bit(mirrors);
ofproto-dpif-xlate: Don't consider mirrors used when excluded by VLAN. Mirrors can be configured to select packets for mirroring on the basis of multiple criteria: input ports, output ports, and VLANs. A packet P is to be mirrored if there exists a mirror M such that either: - P ingresses on an input port selected by M, or - P egresses on an output port selected by M AND P is in a VLAN selected by M. In addition, every mirror has a destination, which can be an output port or an output VLAN. Either way, if a packet is mirrored to a particular destination, it is done only once, even if different mirrors both select a packet and have the same destination. Since commit efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.), these requirements have been implemented incorrectly: if a packet satisfies one of the bulleted requirements above for mirror M1, but not the VLAN selection requirement for M1, then it was not sent to M's destination, but it was still considered as having been sent to M1's destination for the purpose of avoid output duplication. Thus, if P satisfied *all* of the requirements for a second mirror M2, if M1 and M2 had the same destination, the packet was still not mirrored. This commit fixes that problem. (The issue only occurred if M1 happened to have a smaller index than M2 in OVS's internal data structures. That's just a matter of luck.) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: http://openvswitch.org/pipermail/dev/2016-January/064531.html Fixes: 7efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.) Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-05 19:16:01 -08:00
/* Record the mirror, and the mirrors that output to the same
* destination, so that we don't mirror to them again. This must be
* done now to ensure that output_normal(), below, doesn't recursively
* output to the same mirrors. */
ctx->mirrors |= dup_mirrors;
ctx->mirror_snaplen = snaplen;
ofproto-dpif-xlate: Don't consider mirrors used when excluded by VLAN. Mirrors can be configured to select packets for mirroring on the basis of multiple criteria: input ports, output ports, and VLANs. A packet P is to be mirrored if there exists a mirror M such that either: - P ingresses on an input port selected by M, or - P egresses on an output port selected by M AND P is in a VLAN selected by M. In addition, every mirror has a destination, which can be an output port or an output VLAN. Either way, if a packet is mirrored to a particular destination, it is done only once, even if different mirrors both select a packet and have the same destination. Since commit efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.), these requirements have been implemented incorrectly: if a packet satisfies one of the bulleted requirements above for mirror M1, but not the VLAN selection requirement for M1, then it was not sent to M's destination, but it was still considered as having been sent to M1's destination for the purpose of avoid output duplication. Thus, if P satisfied *all* of the requirements for a second mirror M2, if M1 and M2 had the same destination, the packet was still not mirrored. This commit fixes that problem. (The issue only occurred if M1 happened to have a smaller index than M2 in OVS's internal data structures. That's just a matter of luck.) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: http://openvswitch.org/pipermail/dev/2016-January/064531.html Fixes: 7efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.) Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-05 19:16:01 -08:00
/* Send the packet to the mirror. */
if (out) {
struct xbundle *out_xbundle = xbundle_lookup(ctx->xcfg, out);
if (out_xbundle) {
output_normal(ctx, out_xbundle, &xvlan);
}
} else if (xvlan.v[0].vid != out_vlan
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
&& !eth_addr_is_reserved(ctx->xin->flow.dl_dst)) {
struct xbundle *xb;
uint16_t old_vid = xvlan.v[0].vid;
xvlan.v[0].vid = out_vlan;
LIST_FOR_EACH (xb, list_node, &xbridge->xbundles) {
if (xbundle_includes_vlan(xb, &xvlan)
&& !xbundle_mirror_out(xbridge, xb)) {
output_normal(ctx, xb, &xvlan);
}
}
xvlan.v[0].vid = old_vid;
}
ofproto-dpif-xlate: Don't consider mirrors used when excluded by VLAN. Mirrors can be configured to select packets for mirroring on the basis of multiple criteria: input ports, output ports, and VLANs. A packet P is to be mirrored if there exists a mirror M such that either: - P ingresses on an input port selected by M, or - P egresses on an output port selected by M AND P is in a VLAN selected by M. In addition, every mirror has a destination, which can be an output port or an output VLAN. Either way, if a packet is mirrored to a particular destination, it is done only once, even if different mirrors both select a packet and have the same destination. Since commit efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.), these requirements have been implemented incorrectly: if a packet satisfies one of the bulleted requirements above for mirror M1, but not the VLAN selection requirement for M1, then it was not sent to M's destination, but it was still considered as having been sent to M1's destination for the purpose of avoid output duplication. Thus, if P satisfied *all* of the requirements for a second mirror M2, if M1 and M2 had the same destination, the packet was still not mirrored. This commit fixes that problem. (The issue only occurred if M1 happened to have a smaller index than M2 in OVS's internal data structures. That's just a matter of luck.) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: http://openvswitch.org/pipermail/dev/2016-January/064531.html Fixes: 7efbc3b7c4006c (ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation.) Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-05 19:16:01 -08:00
/* output_normal() could have recursively output (to different
* mirrors), so make sure that we don't send duplicates. */
mirrors &= ~ctx->mirrors;
ctx->mirror_snaplen = 0;
}
if (used_mirrors) {
if (ctx->xin->resubmit_stats) {
mirror_update_stats(xbridge->mbridge, used_mirrors,
ctx->xin->resubmit_stats->n_packets,
ctx->xin->resubmit_stats->n_bytes);
}
if (ctx->xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_MIRROR);
entry->mirror.mbridge = mbridge_ref(xbridge->mbridge);
entry->mirror.mirrors = used_mirrors;
}
}
}
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
static void
mirror_ingress_packet(struct xlate_ctx *ctx)
{
if (mbridge_has_mirrors(ctx->xbridge->mbridge)) {
struct xbundle *xbundle = lookup_input_bundle(
ctx, ctx->xin->flow.in_port.ofp_port, NULL);
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
if (xbundle) {
mirror_packet(ctx, xbundle,
xbundle_mirror_src(ctx->xbridge, xbundle));
}
}
}
/* Checks whether a packet with the given 'vid' may ingress on 'in_xbundle'.
 * If so, returns true.  Otherwise, returns false.
 *
 * 'vid' should be the VID obtained from the 802.1Q header that was received as
 * part of a packet (specify 0 if there was no 802.1Q header), in the range
 * 0...4095. */
static bool
input_vid_is_valid(const struct xlate_ctx *ctx,
                   uint16_t vid, struct xbundle *in_xbundle)
{
    int mode;

    /* The OFPP_NONE pseudo-bundle accepts any VID. */
    if (in_xbundle == &ofpp_none_bundle) {
        return true;
    }

    mode = in_xbundle->vlan_mode;
    if (mode == PORT_VLAN_ACCESS) {
        /* Access ports accept only untagged packets. */
        if (!vid) {
            return true;
        }
        xlate_report_error(ctx, "dropping VLAN %"PRIu16" tagged "
                           "packet received on port %s configured as VLAN "
                           "%d access port", vid, in_xbundle->name,
                           in_xbundle->vlan);
        return false;
    } else if (mode == PORT_VLAN_NATIVE_UNTAGGED
               || mode == PORT_VLAN_NATIVE_TAGGED
               || mode == PORT_VLAN_TRUNK) {
        if (mode != PORT_VLAN_TRUNK && !vid) {
            /* A native-VLAN port must always carry its native VLAN. */
            return true;
        }
        if (!xbundle_trunks_vlan(in_xbundle, vid)) {
            xlate_report_error(ctx, "dropping VLAN %"PRIu16" packet "
                               "received on port %s not configured for "
                               "trunking VLAN %"PRIu16,
                               vid, in_xbundle->name, vid);
            return false;
        }
        return true;
    } else if (mode == PORT_VLAN_DOT1Q_TUNNEL) {
        if (!xbundle_allows_cvlan(in_xbundle, vid)) {
            xlate_report_error(ctx, "dropping VLAN %"PRIu16" packet received "
                               "on dot1q-tunnel port %s that excludes this "
                               "VLAN", vid, in_xbundle->name);
            return false;
        }
        return true;
    } else {
        OVS_NOT_REACHED();
    }
}
static void
xvlan_copy(struct xvlan *dst, const struct xvlan *src)
{
*dst = *src;
}
/* Removes the outermost VLAN header from 'src', shifting the inner headers
 * outward and clearing the now-vacant innermost slot. */
static void
xvlan_pop(struct xvlan *src)
{
    size_t hdr_size = sizeof src->v[0];

    memmove(&src->v[0], &src->v[1], sizeof src->v - hdr_size);
    memset(&src->v[FLOW_MAX_VLAN_HEADERS - 1], 0, hdr_size);
}
/* Makes room for a new outermost VLAN header in 'src' by shifting the
 * existing headers inward (dropping the innermost one) and zeroing the new
 * outermost slot; the caller is expected to fill it in. */
static void
xvlan_push_uninit(struct xvlan *src)
{
    size_t hdr_size = sizeof src->v[0];

    memmove(&src->v[1], &src->v[0], sizeof src->v - hdr_size);
    memset(&src->v[0], 0, hdr_size);
}
/* Extract VLAN information (headers) from flow */
static void
xvlan_extract(const struct flow *flow, struct xvlan *xvlan)
{
int i;
memset(xvlan, 0, sizeof(*xvlan));
for (i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) {
if (!eth_type_vlan(flow->vlans[i].tpid) ||
!(flow->vlans[i].tci & htons(VLAN_CFI))) {
break;
}
xvlan->v[i].tpid = ntohs(flow->vlans[i].tpid);
xvlan->v[i].vid = vlan_tci_to_vid(flow->vlans[i].tci);
xvlan->v[i].pcp = ntohs(flow->vlans[i].tci) & VLAN_PCP_MASK;
}
}
/* Put VLAN information (headers) to flow */
static void
xvlan_put(struct flow *flow, const struct xvlan *xvlan,
          enum port_priority_tags_mode use_priority_tags)
{
    for (int i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) {
        ovs_be16 tci = htons(xvlan->v[i].vid | (xvlan->v[i].pcp
                                                & VLAN_PCP_MASK));
        /* A priority-only tag is emitted when priority tagging is forced and
         * this slot held a header on input. */
        bool force_tag = use_priority_tags == PORT_PRIORITY_TAGS_ALWAYS
                         && xvlan->v[i].tpid;

        if (tci || force_tag) {
            tci |= htons(VLAN_CFI);
            flow->vlans[i].tpid = xvlan->v[i].tpid
                                  ? htons(xvlan->v[i].tpid)
                                  : htons(ETH_TYPE_VLAN_8021Q);
        }
        flow->vlans[i].tci = tci;
    }
}
/* Given 'in_xvlan', extracted from the input 802.1Q headers received as part
 * of a packet, and 'in_xbundle', the bundle on which the packet was received,
 * returns the VLANs of the packet during bridge internal processing. */
static void
xvlan_input_translate(const struct xbundle *in_xbundle,
                      const struct xvlan *in_xvlan, struct xvlan *xvlan)
{
    /* TPID to use when internal processing needs a tag that the incoming
     * packet may not have carried: reuse the packet's TPID, else 802.1Q. */
    uint16_t dflt_tpid = in_xvlan->v[0].tpid
                         ? in_xvlan->v[0].tpid : ETH_TYPE_VLAN_8021Q;

    switch (in_xbundle->vlan_mode) {
    case PORT_VLAN_ACCESS:
        /* Everything on an access port belongs to the configured VLAN;
         * only the PCP is carried over from the wire. */
        memset(xvlan, 0, sizeof *xvlan);
        xvlan->v[0].tpid = dflt_tpid;
        xvlan->v[0].vid = in_xbundle->vlan;
        xvlan->v[0].pcp = in_xvlan->v[0].pcp;
        break;

    case PORT_VLAN_TRUNK:
        /* Trunk ports pass tags through unchanged. */
        xvlan_copy(xvlan, in_xvlan);
        break;

    case PORT_VLAN_NATIVE_UNTAGGED:
    case PORT_VLAN_NATIVE_TAGGED:
        /* Untagged traffic maps to the native VLAN; tagged traffic passes
         * through as on a trunk. */
        xvlan_copy(xvlan, in_xvlan);
        if (!in_xvlan->v[0].vid) {
            xvlan->v[0].tpid = dflt_tpid;
            xvlan->v[0].vid = in_xbundle->vlan;
            xvlan->v[0].pcp = in_xvlan->v[0].pcp;
        }
        break;

    case PORT_VLAN_DOT1Q_TUNNEL:
        /* Push the configured service tag on top of whatever arrived. */
        xvlan_copy(xvlan, in_xvlan);
        xvlan_push_uninit(xvlan);
        xvlan->v[0].tpid = in_xbundle->qinq_ethtype;
        xvlan->v[0].vid = in_xbundle->vlan;
        xvlan->v[0].pcp = 0;
        break;

    default:
        OVS_NOT_REACHED();
    }
}
/* Given 'xvlan', the VLANs of a packet during internal processing, and
 * 'out_xbundle', a bundle on which the packet is to be output, returns the
 * VLANs that should be included in output packet. */
static void
xvlan_output_translate(const struct xbundle *out_xbundle,
                       const struct xvlan *xvlan, struct xvlan *out_xvlan)
{
    switch (out_xbundle->vlan_mode) {
    case PORT_VLAN_ACCESS:
        /* Access ports emit untagged frames only. */
        memset(out_xvlan, 0, sizeof *out_xvlan);
        return;

    case PORT_VLAN_TRUNK:
    case PORT_VLAN_NATIVE_TAGGED:
        xvlan_copy(out_xvlan, xvlan);
        return;

    case PORT_VLAN_NATIVE_UNTAGGED:
        /* The native VLAN leaves the port untagged. */
        xvlan_copy(out_xvlan, xvlan);
        if (xvlan->v[0].vid == out_xbundle->vlan) {
            xvlan_pop(out_xvlan);
        }
        return;

    case PORT_VLAN_DOT1Q_TUNNEL:
        /* Strip the service tag that was pushed at ingress. */
        xvlan_copy(out_xvlan, xvlan);
        xvlan_pop(out_xvlan);
        return;

    default:
        OVS_NOT_REACHED();
    }
}
/* If output xbundle is dot1q-tunnel, set mask bits of cvlan */
static void
check_and_set_cvlan_mask(struct flow_wildcards *wc,
                         const struct xbundle *xbundle)
{
    if (xbundle->vlan_mode != PORT_VLAN_DOT1Q_TUNNEL || !xbundle->cvlans) {
        return;
    }
    wc->masks.vlans[1].tci = htons(0xffff);
}
static void
output_normal(struct xlate_ctx *ctx, const struct xbundle *out_xbundle,
const struct xvlan *xvlan)
{
uint16_t vid;
union flow_vlan_hdr old_vlans[FLOW_MAX_VLAN_HEADERS];
struct xport *xport;
struct xlate_bond_recirc xr;
bool use_recirc = false;
struct xvlan out_xvlan;
check_and_set_cvlan_mask(ctx->wc, out_xbundle);
xvlan_output_translate(out_xbundle, xvlan, &out_xvlan);
if (out_xbundle->use_priority_tags) {
out_xvlan.v[0].pcp = ntohs(ctx->xin->flow.vlans[0].tci) &
VLAN_PCP_MASK;
}
vid = out_xvlan.v[0].vid;
if (ovs_list_is_empty(&out_xbundle->xports)) {
/* Partially configured bundle with no members. Drop the packet. */
return;
} else if (!out_xbundle->bond) {
xport = CONTAINER_OF(ovs_list_front(&out_xbundle->xports), struct xport,
bundle_node);
} else {
struct flow_wildcards *wc = ctx->wc;
struct ofport_dpif *ofport;
if (ctx->xbridge->support.odp.recirc) {
/* In case recirculation is not actually in use, 'xr.recirc_id'
* will be set to '0', since a valid 'recirc_id' can
ofproto/bond: Fix bond reconfiguration race condition. During the upcall thread bond output translation, bond_may_recirc() is currently called outside the lock. In case the main thread executes bond_reconfigure() at the same time, the upcall thread may find bond state to be inconsistent when calling bond_update_post_recirc_rules(). This patch fixes the race condition by acquiring the write lock before calling bond_may_recirc(). The APIs are refactored slightly. The race condition can result in the following stack trace. Copied from 'Reported-at': Thread 23 handler69: Invalid write of size 8 update_recirc_rules (bond.c:385) bond_update_post_recirc_rules__ (bond.c:952) bond_update_post_recirc_rules (bond.c:960) output_normal (ofproto-dpif-xlate.c:2102) xlate_normal (ofproto-dpif-xlate.c:2858) xlate_output_action (ofproto-dpif-xlate.c:4407) do_xlate_actions (ofproto-dpif-xlate.c:5335) xlate_actions (ofproto-dpif-xlate.c:6198) upcall_xlate (ofproto-dpif-upcall.c:1129) process_upcall (ofproto-dpif-upcall.c:1271) recv_upcalls (ofproto-dpif-upcall.c:822) udpif_upcall_handler (ofproto-dpif-upcall.c:740) Address 0x18630490 is 1,904 bytes inside a block of size 12,288 free'd free (vg_replace_malloc.c:529) bond_entry_reset (bond.c:1635) bond_reconfigure (bond.c:457) bundle_set (ofproto-dpif.c:2896) ofproto_bundle_register (ofproto.c:1343) port_configure (bridge.c:1159) bridge_reconfigure (bridge.c:785) bridge_run (bridge.c:3099) main (ovs-vswitchd.c:111) Block was alloc'd at malloc (vg_replace_malloc.c:298) xmalloc (util.c:110) bond_entry_reset (bond.c:1629) bond_reconfigure (bond.c:457) bond_create (bond.c:245) bundle_set (ofproto-dpif.c:2900) ofproto_bundle_register (ofproto.c:1343) port_configure (bridge.c:1159) bridge_reconfigure (bridge.c:785) bridge_run (bridge.c:3099) main (ovs-vswitchd.c:111) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/328969.html CC: Huanle Han <hanxueluo@gmail.com> 
Signed-off-by: Andy Zhou <azhou@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org> Acked-by: Huanle Han <hanxueluo@gmail.com>
2017-02-22 23:31:31 -08:00
* not be zero. */
bond: Avoid deadlock while updating post recirculation rules. If the PACKET_OUT from controller ends up with sending packet to a bond interface, the main thread will take locks in the following order: handle_openflow --> take ofproto_mutex handle_packet_out packet_xlate output_normal bond_update_post_recirc_rules --> take rwlock in bond.c If at the same time revalidator thread is processing other packet with the output to the same bond: xlate_actions output_normal bond_update_post_recirc_rules --> take rwlock in bond.c update_recirc_rules ofproto_dpif_add_internal_flow ofproto_flow_mod --> take ofproto_mutex So, it is possible for these 2 threads to lock each other by taking one lock and waiting for another thread to release the second lock. It is also possible for the main thread to lock itself up by trying to acquire ofproto_mutex for the second time, if it will actually proceed with update_recirc_rules() after taking the bond rwlock. The problem appears to be that bond_update_post_recirc_rules() is called during the flow translation even if side effects are prohibited, which is the case for openflow PACKET_OUT handling. Skipping actual flow updates during the flow translation if side effects are disabled to avoid the deadlock. Since flows are not installed now when actions translated for very first packet, installing initial flows in bond_reconfigure(). This will cover the case of allocating a new recirc_id. Also checking if we need to update flows in bond_run() to cover link state changes. Regression test is added to catch the double lock case. Reported-at: https://github.com/openvswitch/ovs-issues/issues/259 Reported-by: Daniel Ding <zhihui.ding@easystack.cn> Fixes: adcf00ba35a0 ("ofproto/bond: Implement bond megaflow using recirculation") Acked-by: Mike Pattrick <mkp@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2022-09-13 21:08:52 +02:00
if (ctx->xin->allow_side_effects) {
bond_update_post_recirc_rules(out_xbundle->bond,
&xr.recirc_id,
&xr.hash_basis);
} else {
/* If side effects are not allowed, only getting the bond
* configuration. Rule updates will be handled by the
* main thread later. */
bond_get_recirc_id_and_hash_basis(out_xbundle->bond,
&xr.recirc_id,
&xr.hash_basis);
}
ofproto/bond: Fix bond reconfiguration race condition. During the upcall thread bond output translation, bond_may_recirc() is currently called outside the lock. In case the main thread executes bond_reconfigure() at the same time, the upcall thread may find bond state to be inconsistent when calling bond_update_post_recirc_rules(). This patch fixes the race condition by acquiring the write lock before calling bond_may_recirc(). The APIs are refactored slightly. The race condition can result in the following stack trace. Copied from 'Reported-at': Thread 23 handler69: Invalid write of size 8 update_recirc_rules (bond.c:385) bond_update_post_recirc_rules__ (bond.c:952) bond_update_post_recirc_rules (bond.c:960) output_normal (ofproto-dpif-xlate.c:2102) xlate_normal (ofproto-dpif-xlate.c:2858) xlate_output_action (ofproto-dpif-xlate.c:4407) do_xlate_actions (ofproto-dpif-xlate.c:5335) xlate_actions (ofproto-dpif-xlate.c:6198) upcall_xlate (ofproto-dpif-upcall.c:1129) process_upcall (ofproto-dpif-upcall.c:1271) recv_upcalls (ofproto-dpif-upcall.c:822) udpif_upcall_handler (ofproto-dpif-upcall.c:740) Address 0x18630490 is 1,904 bytes inside a block of size 12,288 free'd free (vg_replace_malloc.c:529) bond_entry_reset (bond.c:1635) bond_reconfigure (bond.c:457) bundle_set (ofproto-dpif.c:2896) ofproto_bundle_register (ofproto.c:1343) port_configure (bridge.c:1159) bridge_reconfigure (bridge.c:785) bridge_run (bridge.c:3099) main (ovs-vswitchd.c:111) Block was alloc'd at malloc (vg_replace_malloc.c:298) xmalloc (util.c:110) bond_entry_reset (bond.c:1629) bond_reconfigure (bond.c:457) bond_create (bond.c:245) bundle_set (ofproto-dpif.c:2900) ofproto_bundle_register (ofproto.c:1343) port_configure (bridge.c:1159) bridge_reconfigure (bridge.c:785) bridge_run (bridge.c:3099) main (ovs-vswitchd.c:111) Reported-by: Huanle Han <hanxueluo@gmail.com> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/328969.html CC: Huanle Han <hanxueluo@gmail.com> 
Signed-off-by: Andy Zhou <azhou@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org> Acked-by: Huanle Han <hanxueluo@gmail.com>
2017-02-22 23:31:31 -08:00
if (xr.recirc_id) {
/* Use recirculation instead of output. */
use_recirc = true;
xr.hash_alg = OVS_HASH_ALG_L4;
/* Recirculation does not require unmasking hash fields. */
wc = NULL;
}
}
ofport = bond_choose_output_member(out_xbundle->bond,
&ctx->xin->flow, wc, vid);
xport = xport_lookup(ctx->xcfg, ofport);
if (!xport) {
/* No member interfaces enabled, so drop packet. */
return;
}
/* If use_recirc is set, the main thread will handle stats
* accounting for this bond. */
if (!use_recirc) {
if (ctx->xin->resubmit_stats) {
bond_account(out_xbundle->bond, &ctx->xin->flow, vid,
ctx->xin->resubmit_stats->n_bytes);
}
if (ctx->xin->xcache) {
struct xc_entry *entry;
struct flow *flow;
flow = &ctx->xin->flow;
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_BOND);
entry->bond.bond = bond_ref(out_xbundle->bond);
entry->bond.flow = xmemdup(flow, sizeof *flow);
entry->bond.vid = vid;
}
}
}
memcpy(&old_vlans, &ctx->xin->flow.vlans, sizeof(old_vlans));
xvlan_put(&ctx->xin->flow, &out_xvlan, out_xbundle->use_priority_tags);
compose_output_action(ctx, xport->ofp_port, use_recirc ? &xr : NULL,
false, false);
memcpy(&ctx->xin->flow.vlans, &old_vlans, sizeof(old_vlans));
}
/* A VM broadcasts a gratuitous ARP to indicate that it has resumed after
 * migration.  Older Citrix-patched Linux DomU used gratuitous ARP replies to
 * indicate this; newer upstream kernels use gratuitous ARP requests. */
static bool
is_gratuitous_arp(const struct flow *flow, struct flow_wildcards *wc)
{
    if (flow->dl_type != htons(ETH_TYPE_ARP)) {
        return false;
    }

    memset(&wc->masks.dl_dst, 0xff, sizeof wc->masks.dl_dst);
    if (!eth_addr_is_broadcast(flow->dl_dst)) {
        return false;
    }

    memset(&wc->masks.nw_proto, 0xff, sizeof wc->masks.nw_proto);
    switch (flow->nw_proto) {
    case ARP_OP_REPLY:
        /* Any broadcast ARP reply is treated as gratuitous. */
        return true;

    case ARP_OP_REQUEST:
        /* A request is gratuitous when sender and target IP match. */
        memset(&wc->masks.nw_src, 0xff, sizeof wc->masks.nw_src);
        memset(&wc->masks.nw_dst, 0xff, sizeof wc->masks.nw_dst);
        return flow->nw_src == flow->nw_dst;

    default:
        return false;
    }
}
/* Determines whether packets in 'flow' within 'xbridge' should be forwarded or
 * dropped.  Returns true if they may be forwarded, false if they should be
 * dropped.
 *
 * 'in_port' must be the xport that corresponds to flow->in_port.
 * 'in_port' must be part of a bundle (e.g. in_port->bundle must be nonnull).
 *
 * 'vlan' must be the VLAN that corresponds to flow->vlan_tci on 'in_port', as
 * returned by input_vid_to_vlan().  It must be a valid VLAN for 'in_port', as
 * checked by input_vid_is_valid().
 *
 * May also add tags to '*tags', although the current implementation only does
 * so in one special case.
 *
 * NOTE(review): the original text had unrelated commit-log lines interleaved
 * inside the BV_DROP_IF_MOVED condition (an extraction artifact); they were
 * removed here. */
static bool
is_admissible(struct xlate_ctx *ctx, struct xport *in_port,
              uint16_t vlan)
{
    struct xbundle *in_xbundle = in_port->xbundle;
    const struct xbridge *xbridge = ctx->xbridge;
    struct flow *flow = &ctx->xin->flow;

    /* Drop frames for reserved multicast addresses
     * only if forward_bpdu option is absent. */
    if (!xbridge->forward_bpdu && eth_addr_is_reserved(flow->dl_dst)) {
        xlate_report(ctx, OFT_DETAIL,
                     "packet has reserved destination MAC, dropping");
        return false;
    }

    if (in_xbundle->bond) {
        struct mac_entry *mac;

        switch (bond_check_admissibility(in_xbundle->bond, in_port->ofport,
                                         flow->dl_dst)) {
        case BV_ACCEPT:
            break;

        case BV_DROP:
            xlate_report(ctx, OFT_DETAIL,
                         "bonding refused admissibility, dropping");
            return false;

        case BV_DROP_IF_MOVED:
            /* SLB bonds drop packets whose source MAC is learned on a
             * different bundle, unless it is a gratuitous ARP that is not
             * locked out (which legitimately signals a MAC move). */
            ovs_rwlock_rdlock(&xbridge->ml->rwlock);
            mac = mac_learning_lookup(xbridge->ml, flow->dl_src, vlan);
            if (mac
                && mac_entry_get_port(xbridge->ml, mac) != in_xbundle->ofbundle
                && (!is_gratuitous_arp(flow, ctx->wc)
                    || mac_entry_is_grat_arp_locked(mac))) {
                ovs_rwlock_unlock(&xbridge->ml->rwlock);
                xlate_report(ctx, OFT_DETAIL,
                             "SLB bond thinks this packet looped back, "
                             "dropping");
                return false;
            }
            ovs_rwlock_unlock(&xbridge->ml->rwlock);
            break;
        }
    }

    return true;
}
/* Helper for update_learning_table().  Returns true when nothing was
 * learned (input bundle is OFPP_NONE or the MAC table is unchanged),
 * false when the MAC learning table was actually updated. */
static bool
update_learning_table__(const struct xbridge *xbridge,
                        struct xbundle *in_xbundle, struct eth_addr dl_src,
                        int vlan, bool is_grat_arp)
{
    if (in_xbundle == &ofpp_none_bundle) {
        /* Never learn from the OFPP_NONE pseudo-bundle. */
        return true;
    }
    return !mac_learning_update(xbridge->ml, dl_src, vlan, is_grat_arp,
                                in_xbundle->bond != NULL,
                                in_xbundle->ofbundle);
}
/* Updates the bridge's MAC learning table for 'dl_src' seen on 'in_xbundle'
 * in 'vlan', and reports when a new association was learned. */
static void
update_learning_table(const struct xlate_ctx *ctx,
                      struct xbundle *in_xbundle, struct eth_addr dl_src,
                      int vlan, bool is_grat_arp)
{
    bool unchanged = update_learning_table__(ctx->xbridge, in_xbundle,
                                             dl_src, vlan, is_grat_arp);

    if (!unchanged) {
        xlate_report_debug(ctx, OFT_DETAIL, "learned that "ETH_ADDR_FMT" is "
                           "on port %s in VLAN %d",
                           ETH_ADDR_ARGS(dl_src), in_xbundle->name, vlan);
    }
}
/* Updates multicast snooping table 'ms' given that a packet matching 'flow'
 * was received on 'in_xbundle' in 'vlan' and is either Report or Query. */
static void
update_mcast_snooping_table4__(const struct xlate_ctx *ctx,
                               const struct flow *flow,
                               struct mcast_snooping *ms, int vlan,
                               struct xbundle *in_xbundle,
                               const struct dp_packet *packet)
    OVS_REQ_WRLOCK(ms->rwlock)
{
    const struct igmp_header *igmp;
    int count;
    size_t offset;
    ovs_be32 ip4 = flow->igmp_group_ip4;

    /* Locate the IGMP header inside the packet and verify the checksum over
     * the whole L4 payload before trusting any of its contents. */
    offset = (char *) dp_packet_l4(packet) - (char *) dp_packet_data(packet);
    igmp = dp_packet_at(packet, offset, IGMP_HEADER_LEN);
    if (!igmp || csum(igmp, dp_packet_l4_size(packet)) != 0) {
        xlate_report_debug(ctx, OFT_DETAIL,
                           "multicast snooping received bad IGMP "
                           "checksum on port %s in VLAN %d",
                           in_xbundle->name, vlan);
        return;
    }

    /* For IGMP flows, 'tp_src' carries the IGMP message type. */
    switch (ntohs(flow->tp_src)) {
    case IGMP_HOST_MEMBERSHIP_REPORT:
    case IGMPV2_HOST_MEMBERSHIP_REPORT:
        /* Membership report: learn the group on the input bundle. */
        if (mcast_snooping_add_group4(ms, ip4, vlan, in_xbundle->ofbundle)) {
            xlate_report_debug(ctx, OFT_DETAIL,
                               "multicast snooping learned that "
                               IP_FMT" is on port %s in VLAN %d",
                               IP_ARGS(ip4), in_xbundle->name, vlan);
        }
        break;
    case IGMP_HOST_LEAVE_MESSAGE:
        /* Leave message: forget the group on the input bundle. */
        if (mcast_snooping_leave_group4(ms, ip4, vlan, in_xbundle->ofbundle)) {
            xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping leaving "
                               IP_FMT" is on port %s in VLAN %d",
                               IP_ARGS(ip4), in_xbundle->name, vlan);
        }
        break;
    case IGMP_HOST_MEMBERSHIP_QUERY:
        /* A query with a nonzero source IP identifies a multicast router
         * behind the input bundle. */
        if (flow->nw_src && mcast_snooping_add_mrouter(ms, vlan,
                                                       in_xbundle->ofbundle)) {
            xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping query "
                               "from "IP_FMT" is on port %s in VLAN %d",
                               IP_ARGS(flow->nw_src), in_xbundle->name, vlan);
        }
        break;
    case IGMPV3_HOST_MEMBERSHIP_REPORT:
        /* An IGMPv3 report may carry multiple group records; the helper
         * returns how many addresses it processed. */
        count = mcast_snooping_add_report(ms, packet, vlan,
                                          in_xbundle->ofbundle);
        if (count) {
            xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping processed "
                               "%d addresses on port %s in VLAN %d",
                               count, in_xbundle->name, vlan);
        }
        break;
    }
}
/* IPv6 (MLD) counterpart of update_mcast_snooping_table4__(): updates
 * multicast snooping table 'ms' for an MLD packet matching 'flow' received
 * on 'in_xbundle' in 'vlan'. */
static void
update_mcast_snooping_table6__(const struct xlate_ctx *ctx,
                               const struct flow *flow,
                               struct mcast_snooping *ms, int vlan,
                               struct xbundle *in_xbundle,
                               const struct dp_packet *packet)
    OVS_REQ_WRLOCK(ms->rwlock)
{
    const struct mld_header *mld;
    int count;
    size_t offset;

    /* Locate the MLD header and verify the ICMPv6 upper-layer checksum
     * (which covers the IPv6 pseudo-header) before trusting the message. */
    offset = (char *) dp_packet_l4(packet) - (char *) dp_packet_data(packet);
    mld = dp_packet_at(packet, offset, MLD_HEADER_LEN);

    if (!mld ||
        packet_csum_upperlayer6(dp_packet_l3(packet),
                                mld, IPPROTO_ICMPV6,
                                dp_packet_l4_size(packet)) != 0) {
        xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping received "
                           "bad MLD checksum on port %s in VLAN %d",
                           in_xbundle->name, vlan);
        return;
    }

    /* For MLD flows, 'tp_src' carries the MLD message type. */
    switch (ntohs(flow->tp_src)) {
    case MLD_QUERY:
        /* A query with a non-unspecified source identifies a multicast
         * router behind the input bundle. */
        if (!ipv6_addr_equals(&flow->ipv6_src, &in6addr_any)
            && mcast_snooping_add_mrouter(ms, vlan, in_xbundle->ofbundle)) {
            xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping query on "
                               "port %s in VLAN %d", in_xbundle->name, vlan);
        }
        break;
    case MLD_REPORT:
    case MLD_DONE:
    case MLD2_REPORT:
        /* The helper parses the report body and returns how many group
         * addresses it processed. */
        count = mcast_snooping_add_mld(ms, packet, vlan, in_xbundle->ofbundle);
        if (count) {
            xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping processed "
                               "%d addresses on port %s in VLAN %d",
                               count, in_xbundle->name, vlan);
        }
        break;
    }
}
/* Updates multicast snooping table 'ms' given that a packet matching 'flow'
 * was received on 'in_xbundle' in 'vlan'. */
static void
update_mcast_snooping_table(const struct xlate_ctx *ctx,
                            const struct flow *flow, int vlan,
                            struct xbundle *in_xbundle,
                            const struct dp_packet *packet)
{
    struct mcast_snooping *ms = ctx->xbridge->ms;
    struct xbundle *flood_xbundle = NULL;
    struct mcast_port_bundle *flood_port;

    /* Never learn from the OFPP_NONE pseudo-port. */
    if (in_xbundle == &ofpp_none_bundle) {
        return;
    }

    ovs_rwlock_wrlock(&ms->rwlock);

    /* Packets arriving on flood ports must not be learned from: scan the
     * flood-port list for the input bundle. */
    LIST_FOR_EACH(flood_port, node, &ms->fport_list) {
        flood_xbundle = xbundle_lookup(ctx->xcfg, flood_port->port);
        if (flood_xbundle == in_xbundle) {
            break;
        }
    }

    if (flood_xbundle != in_xbundle || !flood_xbundle) {
        if (flow->dl_type == htons(ETH_TYPE_IP)) {
            update_mcast_snooping_table4__(ctx, flow, ms, vlan,
                                           in_xbundle, packet);
        } else {
            update_mcast_snooping_table6__(ctx, flow, ms, vlan,
                                           in_xbundle, packet);
        }
    }

    ovs_rwlock_unlock(&ms->rwlock);
}
/* A list of multicast output ports.
 *
 * We accumulate output ports and then do all the outputs afterward.  It would
 * be more natural to do the outputs one at a time as we discover the need for
 * each one, but this can cause a deadlock because we need to take the
 * mcast_snooping's rwlock for reading to iterate through the port lists and
 * doing an output, if it goes to a patch port, can eventually come back to the
 * same mcast_snooping and attempt to take the write lock (see
 * https://github.com/openvswitch/ovs-issues/issues/153). */
struct mcast_output {
    /* Discrete ports. */
    struct xbundle **xbundles;  /* Dynamically grown array of output ports. */
    size_t n, allocated;        /* Elements in use / array capacity. */

    /* If set, flood to all ports. */
    bool flood;
};
/* Stack initializer for an empty 'struct mcast_output'. */
#define MCAST_OUTPUT_INIT { NULL, 0, 0, false }
/* Appends 'mcast_xbundle' to the accumulated output list in 'out', growing
 * the backing array as needed. */
static void
mcast_output_add(struct mcast_output *out, struct xbundle *mcast_xbundle)
{
    if (out->allocated <= out->n) {
        out->xbundles = x2nrealloc(out->xbundles, &out->allocated,
                                   sizeof *out->xbundles);
    }
    out->xbundles[out->n] = mcast_xbundle;
    out->n++;
}
/* Outputs the packet in 'ctx' to all of the output ports in 'out', given input
 * bundle 'in_xbundle' and the current 'xvlan', then releases 'out's storage. */
static void
mcast_output_finish(struct xlate_ctx *ctx, struct mcast_output *out,
                    struct xbundle *in_xbundle, struct xvlan *xvlan)
{
    if (!out->flood) {
        size_t i;

        for (i = 0; i < out->n; i++) {
            output_normal(ctx, out->xbundles[i], xvlan);
        }
    } else {
        /* The flood flag overrides the discrete port list. */
        xlate_normal_flood(ctx, in_xbundle, xvlan);
    }

    free(out->xbundles);
}
/* send the packet to ports having the multicast group learned */
static void
xlate_normal_mcast_send_group(struct xlate_ctx *ctx,
                              struct mcast_snooping *ms OVS_UNUSED,
                              struct mcast_group *grp,
                              struct xbundle *in_xbundle,
                              struct mcast_output *out)
    OVS_REQ_RDLOCK(ms->rwlock)
{
    struct mcast_group_bundle *gb;

    LIST_FOR_EACH(gb, bundle_node, &grp->bundle_lru) {
        struct xbundle *grp_xbundle = xbundle_lookup(ctx->xcfg, gb->port);

        if (!grp_xbundle) {
            xlate_report(ctx, OFT_WARN,
                         "mcast group port is unknown, dropping");
        } else if (grp_xbundle == in_xbundle) {
            xlate_report(ctx, OFT_DETAIL,
                         "mcast group port is input port, dropping");
        } else {
            xlate_report(ctx, OFT_DETAIL, "forwarding to mcast group port");
            mcast_output_add(out, grp_xbundle);
        }
    }
}
/* send the packet to ports connected to multicast routers */
static void
xlate_normal_mcast_send_mrouters(struct xlate_ctx *ctx,
                                 struct mcast_snooping *ms,
                                 struct xbundle *in_xbundle,
                                 const struct xvlan *xvlan,
                                 struct mcast_output *out)
    OVS_REQ_RDLOCK(ms->rwlock)
{
    struct mcast_mrouter_bundle *mrouter;

    LIST_FOR_EACH(mrouter, mrouter_node, &ms->mrouter_lru) {
        struct xbundle *mr_xbundle = xbundle_lookup(ctx->xcfg, mrouter->port);

        if (!mr_xbundle) {
            xlate_report(ctx, OFT_WARN,
                         "mcast router port is unknown, dropping");
        } else if (mrouter->vlan != xvlan->v[0].vid) {
            xlate_report(ctx, OFT_DETAIL,
                         "mcast router is on another vlan, dropping");
        } else if (mr_xbundle == in_xbundle) {
            xlate_report(ctx, OFT_DETAIL,
                         "mcast router port is input port, dropping");
        } else {
            xlate_report(ctx, OFT_DETAIL, "forwarding to mcast router port");
            mcast_output_add(out, mr_xbundle);
        }
    }
}
/* send the packet to ports flagged to be flooded */
static void
xlate_normal_mcast_send_fports(struct xlate_ctx *ctx,
                               struct mcast_snooping *ms,
                               struct xbundle *in_xbundle,
                               struct mcast_output *out)
    OVS_REQ_RDLOCK(ms->rwlock)
{
    struct mcast_port_bundle *fport;

    LIST_FOR_EACH(fport, node, &ms->fport_list) {
        struct xbundle *flood_xbundle = xbundle_lookup(ctx->xcfg, fport->port);

        if (!flood_xbundle) {
            xlate_report(ctx, OFT_WARN,
                         "mcast flood port is unknown, dropping");
        } else if (flood_xbundle == in_xbundle) {
            xlate_report(ctx, OFT_DETAIL,
                         "mcast flood port is input port, dropping");
        } else {
            xlate_report(ctx, OFT_DETAIL, "forwarding to mcast flood port");
            mcast_output_add(out, flood_xbundle);
        }
    }
}
/* forward the Reports to configured ports */
static void
xlate_normal_mcast_send_rports(struct xlate_ctx *ctx,
struct mcast_snooping *ms,
struct xbundle *in_xbundle,
struct mcast_output *out)
OVS_REQ_RDLOCK(ms->rwlock)
{
struct mcast_port_bundle *rport;
struct xbundle *mcast_xbundle;
LIST_FOR_EACH(rport, node, &ms->rport_list) {
mcast_xbundle = xbundle_lookup(ctx->xcfg, rport->port);
ofproto: Fix wrong datapath flow with same in_port and output port. In my test, the new datapath flow which has the same in_port and actions output port was found using ovs-appctl dpctl/dump-flows. Then the mac address will move from one port to another and back it again in the physical switch. This problem result in the VM's traffic become abnormal. My test key steps: 1) There are three VM using ovs bridge and intel 82599 nics as uplink port, deployed in different hosts connecting to the same physical switch. They can be named using VM-A, VM-B and VM-C, Host-A, Host-B, Host-C. 2) VM-A send many unicast packets to VM-B, and VM-B also send unicast packets to VM-A. 3) VM-C ping VM-A continuously, and do ovs port add/delete testing in Host-C ovs bridge. 4) In some abormal scence, the physical switch clear all the mac-entry on each ports. Then Host-C ovs bridge's uplink port will receive two direction packets(VM-A to VM-B, and VM-B to VM-A). The expected result is that this two direction packets should be droppd in the uplink port. Because the dst port of this packets is the uplink port which is also the src port by looking ovs bridge's mac-entry table learned by ovs NORMAL rules. But the truth is some packets being sent back to uplink port and physical switch. And then VM-A's mac was moved to the physical switch port of Host-C from the port of Host-A, as a reulst, VM-C ping VM-A failed at this time. When this problem occurs, the abnormal ovs datapath's flow "in_port(2) actions:2" was found by executing the command "ovs-appctl dpctl/dump-flows". Currently, xlate_normal() uses xbundle pointer compare to verify the packet's dst port whether is same with its input port. This implemention may be wrong while calling xlate_txn_start/xlate_txn_commit in type_run() at the same time, because xcfg/xbridge/xbundle object was reallocated and copied just before we lookup the dst mac_port and mac_xbundle. 
Then mac_xbundle and in_xbundle are same related with the uplink port but not same object pointer. And we can fix this bug by adding ofbundle check conditions shown in my patch. Signed-off-by: Lilijun <jerry.lilijun@huawei.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-01-19 08:12:30 +00:00
if (mcast_xbundle
&& mcast_xbundle != in_xbundle
&& mcast_xbundle->ofbundle != in_xbundle->ofbundle) {
xlate_report(ctx, OFT_DETAIL,
"forwarding report to mcast flagged port");
mcast_output_add(out, mcast_xbundle);
} else if (!mcast_xbundle) {
xlate_report(ctx, OFT_WARN,
"mcast port is unknown, dropping the report");
} else {
xlate_report(ctx, OFT_DETAIL,
"mcast port is input port, dropping the Report");
}
}
}
/* Floods the packet in 'ctx' to every bundle on the bridge other than the
 * input bundle, restricted to bundles that carry 'xvlan', are floodable, and
 * are not reserved as mirror outputs.  Sets the NetFlow output interface to
 * NF_OUT_FLOOD.
 *
 * NOTE(review): the original text had unrelated commit-log lines interleaved
 * inside the flooding condition (an extraction artifact); they were removed
 * here. */
static void
xlate_normal_flood(struct xlate_ctx *ctx, struct xbundle *in_xbundle,
                   struct xvlan *xvlan)
{
    struct xbundle *xbundle;

    LIST_FOR_EACH (xbundle, list_node, &ctx->xbridge->xbundles) {
        if (xbundle != in_xbundle
            /* Also compare 'ofbundle': xbundle objects may be reallocated
             * during reconfiguration, so pointer equality of xbundles alone
             * is not a reliable same-port check. */
            && xbundle->ofbundle != in_xbundle->ofbundle
            && xbundle_includes_vlan(xbundle, xvlan)
            && xbundle->floodable
            && !xbundle_mirror_out(ctx->xbridge, xbundle)) {
            output_normal(ctx, xbundle, xvlan);
        }
    }

    ctx->nf_output_iface = NF_OUT_FLOOD;
}
static bool
is_ip_local_multicast(const struct flow *flow, struct flow_wildcards *wc)
{
if (flow->dl_type == htons(ETH_TYPE_IP)) {
memset(&wc->masks.nw_dst, 0xff, sizeof wc->masks.nw_dst);
return ip_is_local_multicast(flow->nw_dst);
} else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
memset(&wc->masks.ipv6_dst, 0xff, sizeof wc->masks.ipv6_dst);
return ipv6_is_all_hosts(&flow->ipv6_dst);
} else {
return false;
}
}
/* Implements the OpenFlow "normal" action: L2 learning-switch behavior.
 *
 * Validates the input bundle and VLAN, optionally learns the source MAC
 * (and/or records an xlate cache entry so learning can happen later), then
 * chooses the output: IGMP/MLD-snooping-aware multicast forwarding when
 * snooping is enabled, otherwise MAC-learning-table lookup with flooding as
 * the fallback.  Drops are reported through xlate_report(). */
static void
xlate_normal(struct xlate_ctx *ctx)
{
    struct flow_wildcards *wc = ctx->wc;
    struct flow *flow = &ctx->xin->flow;
    struct xbundle *in_xbundle;
    struct xport *in_port;
    struct mac_entry *mac;
    void *mac_port;
    struct xvlan in_xvlan;
    struct xvlan xvlan;
    uint16_t vlan;

    /* The decisions below depend on Ethernet addresses and the VLAN tag, so
     * un-wildcard them for the resulting megaflow. */
    memset(&wc->masks.dl_src, 0xff, sizeof wc->masks.dl_src);
    memset(&wc->masks.dl_dst, 0xff, sizeof wc->masks.dl_dst);
    wc->masks.vlans[0].tci |= htons(VLAN_VID_MASK | VLAN_CFI);

    in_xbundle = lookup_input_bundle(ctx, flow->in_port.ofp_port, &in_port);
    if (!in_xbundle) {
        xlate_report(ctx, OFT_WARN, "no input bundle, dropping");
        return;
    }

    /* Drop malformed frames. */
    if (eth_type_vlan(flow->dl_type) &&
        !(flow->vlans[0].tci & htons(VLAN_CFI))) {
        if (ctx->xin->packet != NULL) {
            xlate_report_error(ctx, "dropping packet with partial "
                               "VLAN tag received on port %s",
                               in_xbundle->name);
        }
        xlate_report(ctx, OFT_WARN, "partial VLAN tag, dropping");
        return;
    }

    /* Drop frames on bundles reserved for mirroring. */
    if (xbundle_mirror_out(ctx->xbridge, in_xbundle)) {
        if (ctx->xin->packet != NULL) {
            xlate_report_error(ctx, "dropping packet received on port %s, "
                               "which is reserved exclusively for mirroring",
                               in_xbundle->name);
        }
        xlate_report(ctx, OFT_WARN,
                     "input port is mirror output port, dropping");
        return;
    }

    /* Check VLAN. */
    xvlan_extract(flow, &in_xvlan);
    if (!input_vid_is_valid(ctx, in_xvlan.v[0].vid, in_xbundle)) {
        xlate_report(ctx, OFT_WARN,
                     "disallowed VLAN VID for this input port, dropping");
        return;
    }
    xvlan_input_translate(in_xbundle, &in_xvlan, &xvlan);
    vlan = xvlan.v[0].vid;

    /* Check other admissibility requirements. */
    if (in_port && !is_admissible(ctx, in_port, vlan)) {
        return;
    }

    /* Learn source MAC.  Only done for Ethernet packets on ports that are
     * not in legacy-L3 mode, and only when side effects are allowed. */
    bool is_grat_arp = is_gratuitous_arp(flow, wc);
    if (ctx->xin->allow_side_effects
        && flow->packet_type == htonl(PT_ETH)
        && in_port && in_port->pt_mode != NETDEV_PT_LEGACY_L3
    ) {
        update_learning_table(ctx, in_xbundle, flow->dl_src, vlan,
                              is_grat_arp);
    }
    if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) {
        struct xc_entry *entry;

        /* Save just enough info to update mac learning table later.  Take a
         * reference on the ofproto so it cannot be destroyed while the cache
         * entry still points at it. */
        if (ofproto_try_ref(&ctx->xbridge->ofproto->up)) {
            entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NORMAL);
            entry->normal.ofproto = ctx->xbridge->ofproto;
            entry->normal.in_port = flow->in_port.ofp_port;
            entry->normal.dl_src = flow->dl_src;
            entry->normal.vlan = vlan;
            entry->normal.is_gratuitous_arp = is_grat_arp;
        }
    }

    /* Determine output bundle. */
    if (mcast_snooping_enabled(ctx->xbridge->ms)
        && !eth_addr_is_broadcast(flow->dl_dst)
        && eth_addr_is_multicast(flow->dl_dst)
        && is_ip_any(flow)) {
        struct mcast_snooping *ms = ctx->xbridge->ms;
        struct mcast_group *grp = NULL;

        if (is_igmp(flow, wc)) {
            /*
             * IGMP packets need to take the slow path, in order to be
             * processed for mdb updates. That will prevent expires
             * firing off even after hosts have sent reports.
             */
            ctx->xout->slow |= SLOW_ACTION;

            if (mcast_snooping_is_membership(flow->tp_src) ||
                mcast_snooping_is_query(flow->tp_src)) {
                if (ctx->xin->allow_side_effects && ctx->xin->packet) {
                    update_mcast_snooping_table(ctx, flow, vlan,
                                                in_xbundle, ctx->xin->packet);
                }
            }

            if (mcast_snooping_is_membership(flow->tp_src)) {
                struct mcast_output out = MCAST_OUTPUT_INIT;

                ovs_rwlock_rdlock(&ms->rwlock);
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                /* RFC4541: section 2.1.1, item 1: A snooping switch should
                 * forward IGMP Membership Reports only to those ports where
                 * multicast routers are attached.  Alternatively stated: a
                 * snooping switch should not forward IGMP Membership Reports
                 * to ports on which only hosts are attached.
                 * An administrative control may be provided to override this
                 * restriction, allowing the report messages to be flooded to
                 * other ports. */
                xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
                ovs_rwlock_unlock(&ms->rwlock);

                mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
            } else {
                xlate_report(ctx, OFT_DETAIL, "multicast traffic, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
            }
            return;
        } else if (is_mld(flow, wc)) {
            ctx->xout->slow |= SLOW_ACTION;
            if (ctx->xin->allow_side_effects && ctx->xin->packet) {
                update_mcast_snooping_table(ctx, flow, vlan,
                                            in_xbundle, ctx->xin->packet);
            }
            if (is_mld_report(flow, wc)) {
                struct mcast_output out = MCAST_OUTPUT_INIT;

                ovs_rwlock_rdlock(&ms->rwlock);
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
                ovs_rwlock_unlock(&ms->rwlock);

                mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
            } else {
                xlate_report(ctx, OFT_DETAIL, "MLD query, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
            }
            return;
        } else {
            if (is_ip_local_multicast(flow, wc)) {
                /* RFC4541: section 2.1.2, item 2: Packets with a dst IP
                 * address in the 224.0.0.x range which are not IGMP must
                 * be forwarded on all ports */
                xlate_report(ctx, OFT_DETAIL,
                             "RFC4541: section 2.1.2, item 2, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
                return;
            }
        }

        /* forwarding to group base ports */
        struct mcast_output out = MCAST_OUTPUT_INIT;

        ovs_rwlock_rdlock(&ms->rwlock);
        if (flow->dl_type == htons(ETH_TYPE_IP)) {
            grp = mcast_snooping_lookup4(ms, flow->nw_dst, vlan);
        } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
            grp = mcast_snooping_lookup(ms, &flow->ipv6_dst, vlan);
        }
        if (grp) {
            xlate_normal_mcast_send_group(ctx, ms, grp, in_xbundle, &out);
            xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
            xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                             &out);
        } else {
            if (mcast_snooping_flood_unreg(ms)) {
                xlate_report(ctx, OFT_DETAIL,
                             "unregistered multicast, flooding");
                out.flood = true;
            } else {
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
            }
        }
        ovs_rwlock_unlock(&ms->rwlock);

        mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
    } else {
        ovs_rwlock_rdlock(&ctx->xbridge->ml->rwlock);
        mac = mac_learning_lookup(ctx->xbridge->ml, flow->dl_dst, vlan);
        mac_port = mac ? mac_entry_get_port(ctx->xbridge->ml, mac) : NULL;
        ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock);

        if (mac_port) {
            struct xbundle *mac_xbundle = xbundle_lookup(ctx->xcfg, mac_port);
            if (mac_xbundle && xbundle_mirror_out(ctx->xbridge, mac_xbundle)) {
                xlate_report(ctx, OFT_WARN,
                             "learned port is a mirror port, dropping");
                return;
            }
            /* Compare underlying ofbundles too: xbundle objects may have been
             * reallocated by a concurrent xlate_txn_start/commit, so pointer
             * inequality alone does not prove different ports. */
            if (mac_xbundle
                && mac_xbundle != in_xbundle
                && mac_xbundle->ofbundle != in_xbundle->ofbundle) {
                xlate_report(ctx, OFT_DETAIL, "forwarding to learned port");
                output_normal(ctx, mac_xbundle, &xvlan);
            } else if (!mac_xbundle) {
                xlate_report(ctx, OFT_WARN,
                             "learned port is unknown, dropping");
            } else {
                xlate_report(ctx, OFT_DETAIL,
                             "learned port is input port, dropping");
            }
        } else {
            xlate_report(ctx, OFT_DETAIL,
                         "no learned MAC for destination, flooding");
            xlate_normal_flood(ctx, in_xbundle, &xvlan);
        }
    }
}
/* Appends a "sample" action for sFlow or IPFIX to 'ctx->odp_actions'. The
* 'probability' is the number of packets out of UINT32_MAX to sample. The
* 'cookie' is passed back in the callback for each sampled packet.
* 'tunnel_out_port', if not ODPP_NONE, is added as the
* OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute. If 'include_actions',
* an OVS_USERSPACE_ATTR_ACTIONS attribute is added. If
* 'emit_set_tunnel', sample(sampling_port=1) would translate into
* datapath sample action set(tunnel(...)), sample(...) and it is used
* for sampling egress tunnel information.
*/
static size_t
compose_sample_action(struct xlate_ctx *ctx,
const uint32_t probability,
const struct user_action_cookie *cookie,
Extend sFlow agent to report tunnel and MPLS structures Packets are still sampled at ingress only, so the egress tunnel and/or MPLS structures are only included when there is just 1 output port. The actions are either provided by the datapath in the sample upcall or looked up in the userspace cache. The former is preferred because it is more reliable and does not present any new demands or constraints on the userspace cache, however the code falls back on the userspace lookup so that this solution can work with existing kernel datapath modules. If the lookup fails it is not critical: the compiled user-action-cookie is still available and provides the essential output port and output VLAN forwarding information just as before. The openvswitch actions can express almost any tunneling/mangling so the only totally faithful representation would be to somehow encode the whole list of flow actions in the sFlow output. However the standard sFlow tunnel structures can express most common real-world scenarios, so in parsing the actions we look for those and skip the encoding if we see anything unusual. For example, a single set(tunnel()) or tnl_push() is interpreted, but if a second such action is encountered then the egress tunnel reporting is suppressed. The sFlow standard allows "best effort" encoding so that if a field is not knowable or too onerous to look up then it can be left out. This is often the case for the layer-4 source port or even the src ip address of a tunnel. The assumption is that monitoring is enabled everywhere so a missing field can typically be seen at ingress to the next switch in the path. This patch also adds unit tests to check the sFlow encoding of set(tunnel()), tnl_push() and push_mpls() actions. The netlink attribute to request that actions be included in the upcall from the datapath is inserted for sFlow sampling only. 
To make that option be explicit would require further changes to the printing and parsing of actions in lib/odp-util.c, and to scripts in the test suite. Further enhancements to report on 802.1AD QinQ, 64-bit tunnel IDs, and NAT transformations can follow in future patches that make only incremental changes. Signed-off-by: Neil McKee <neil.mckee@inmon.com> [blp@nicira.com made stylistic and semantic changes] Signed-off-by: Ben Pfaff <blp@nicira.com>
2015-07-17 21:37:02 -07:00
const odp_port_t tunnel_out_port,
bool include_actions)
{
if (probability == 0) {
/* No need to generate sampling or the inner action. */
return 0;
}
/* If the slow path meter is configured by the controller,
* insert a meter action before the user space action. */
struct ofproto *ofproto = &ctx->xin->ofproto->up;
uint32_t meter_id = ofproto->slowpath_meter_id;
/* When meter action is not required, avoid generate sample action
* for 100% sampling rate. */
bool is_sample = probability < UINT32_MAX || meter_id != UINT32_MAX;
size_t sample_offset = 0, actions_offset = 0;
if (is_sample) {
sample_offset = nl_msg_start_nested(ctx->odp_actions,
OVS_ACTION_ATTR_SAMPLE);
nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY,
probability);
actions_offset = nl_msg_start_nested(ctx->odp_actions,
OVS_SAMPLE_ATTR_ACTIONS);
}
if (meter_id != UINT32_MAX) {
nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id);
}
odp_port_t odp_port = ofp_port_to_odp_port(
ctx->xbridge, ctx->xin->flow.in_port.ofp_port);
uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port);
size_t cookie_offset;
int res = odp_put_userspace_action(pid, cookie, sizeof *cookie,
tunnel_out_port, include_actions,
ctx->odp_actions, &cookie_offset);
ovs_assert(res == 0);
if (is_sample) {
nl_msg_end_nested(ctx->odp_actions, actions_offset);
nl_msg_end_nested(ctx->odp_actions, sample_offset);
}
return cookie_offset;
}
/* If sFLow is not enabled, returns 0 without doing anything.
*
* If sFlow is enabled, appends a template "sample" action to the ODP actions
* in 'ctx'. This action is a template because some of the information needed
* to fill it out is not available until flow translation is complete. In this
* case, this functions returns an offset, which is always nonzero, to pass
* later to fix_sflow_action() to fill in the rest of the template. */
static size_t
compose_sflow_action(struct xlate_ctx *ctx)
{
struct dpif_sflow *sflow = ctx->xbridge->sflow;
if (!sflow || ctx->xin->flow.in_port.ofp_port == OFPP_NONE) {
return 0;
}
ofproto-dpif: Fix using uninitialised memory in user_action_cookie. Designated initializers are not suitable for initializing non-packed structures and unions which are subjects for comparison by memcmp(). Whole memory for 'struct user_action_cookie' must be explicitly cleared before using because it will be copied with memcpy and later compared by memcmp in ofpbuf_equal(). Few issues found be valgrind: Thread 13 revalidator11: Conditional jump or move depends on uninitialised value(s) at 0x4C35D96: __memcmp_sse4_1 (in vgpreload_memcheck.so) by 0x9D4404: ofpbuf_equal (ofpbuf.h:273) by 0x9D4404: revalidate_ukey__ (ofproto-dpif-upcall.c:2219) by 0x9D4404: revalidate_ukey (ofproto-dpif-upcall.c:2286) by 0x9D62AC: revalidate (ofproto-dpif-upcall.c:2685) by 0x9D62AC: udpif_revalidator (ofproto-dpif-upcall.c:942) by 0xA9C732: ovsthread_wrapper (ovs-thread.c:383) by 0x5FF86DA: start_thread (pthread_create.c:463) by 0x6AF488E: clone (clone.S:95) Uninitialised value was created by a stack allocation at 0x9D4450: compose_slow_path (ofproto-dpif-upcall.c:1062) Thread 11 revalidator16: Conditional jump or move depends on uninitialised value(s) at 0x4C35D96: __memcmp_sse4_1 (in vgpreload_memcheck.so) by 0x9D4404: ofpbuf_equal (ofpbuf.h:273) by 0x9D4404: revalidate_ukey__ (ofproto-dpif-upcall.c:2220) by 0x9D4404: revalidate_ukey (ofproto-dpif-upcall.c:2287) by 0x9D62BC: revalidate (ofproto-dpif-upcall.c:2686) by 0x9D62BC: udpif_revalidator (ofproto-dpif-upcall.c:942) by 0xA9C6D2: ovsthread_wrapper (ovs-thread.c:383) by 0x5FF86DA: start_thread (pthread_create.c:463) by 0x6AF488E: clone (clone.S:95) Uninitialised value was created by a stack allocation at 0x9DC4E0: compose_sflow_action (ofproto-dpif-xlate.c:3211) The struct was never marked as 'packed', however it was manually adjusted to be so in practice. Old IPFIX related commit first made the structure non-contiguous. 
Commit 8de6ff3ea864 ("ofproto-dpif: Use a fixed size userspace cookie.") added uninitialized parts of the additional union space and the next one introduced new holes between structure fields for all cases. CC: Justin Pettit <jpettit@ovn.org> Fixes: 8b7ea2d48033 ("Extend OVS IPFIX exporter to export tunnel headers") Fixes: 8de6ff3ea864 ("ofproto-dpif: Use a fixed size userspace cookie.") Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user action cookie.") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Ben Pfaff <blp@ovn.org>
2019-07-25 18:11:13 +03:00
struct user_action_cookie cookie;
memset(&cookie, 0, sizeof cookie);
cookie.type = USER_ACTION_COOKIE_SFLOW;
cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port;
cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid;
return compose_sample_action(ctx, dpif_sflow_get_probability(sflow),
&cookie, ODPP_NONE, true);
}
ipfix: Support tunnel information for Flow IPFIX. Add support to export tunnel information for flow-based IPFIX. The original steps to configure flow level IPFIX: 1) Create a new record in Flow_Sample_Collector_Set table: 'ovs-vsctl -- create Flow_Sample_Collector_Set id=1 bridge="Bridge UUID"' 2) Add IPFIX configuration which is referred by corresponding row in Flow_Sample_Collector_Set table: 'ovs-vsctl -- set Flow_Sample_Collector_Set "Flow_Sample_Collector_Set UUID" ipfix=@i -- --id=@i create IPFIX targets=\"IP:4739\" obs_domain_id=123 obs_point_id=456 cache_active_timeout=60 cache_max_flows=13' 3) Add sample action to the flows: 'ovs-ofctl add-flow mybridge in_port=1, actions=sample'('probability=65535,collector_set_id=1, obs_domain_id=123,obs_point_id=456')',output:3' NXAST_SAMPLE action was used in step 3. In order to support exporting tunnel information, the NXAST_SAMPLE2 action was added and with NXAST_SAMPLE2 action in this patch, the step 3 should be configured like below: 'ovs-ofctl add-flow mybridge in_port=1, actions=sample'('probability=65535,collector_set_id=1,obs_domain_id=123, obs_point_id=456,sampling_port=3')',output:3' 'sampling_port' can be equal to ingress port or one of egress ports. If sampling port is equal to output port and the output port is a tunnel port, OVS_USERSPACE_ATTR_EGRESS_TUN_PORT will be set in the datapath flow sample action. When flow sample action upcall happens, tunnel information will be retrieved from the datapath and then IPFIX can export egress tunnel port information. If samping_port=65535 (OFPP_NONE), flow-based IPFIX will keep the same behavior as before. This patch mainly do three tasks: 1) Add a new flow sample action NXAST_SAMPLE2 to support exporting tunnel information. NXAST_SAMPLE2 action has a new added field 'sampling_port'. 2) Use 'other_configure: enable-tunnel-sampling' to enable or disable exporting tunnel information. 
3) If 'sampling_port' is equal to output port and output port is a tunnel port, the translation of OpenFlow "sample" action should first emit set(tunnel(...)), then the sample action itself. It makes sure the egress tunnel information can be sampled. 4) Add a test of flow-based IPFIX for tunnel set. How to test flow-based IPFIX: 1) Setup a test environment with two Linux host with Docker supported 2) Create a Docker container and a GRE tunnel port on each host 3) Use ovs-docker to add the container on the bridge 4) Listen on port 4739 on the collector machine and use wireshark to filter 'cflow' packets. 5) Configure flow-based IPFIX: - 'ovs-vsctl -- create Flow_Sample_Collector_Set id=1 bridge="Bridge UUID"' - 'ovs-vsctl -- set Flow_Sample_Collector_Set "Flow_Sample_Collector_Set UUID" ipfix=@i -- --id=@i create IPFIX \ targets=\"IP:4739\" cache_active_timeout=60 cache_max_flows=13 \ other_config:enable-tunnel-sampling=true' - 'ovs-ofctl add-flow mybridge in_port=1, actions=sample'('probability=65535,collector_set_id=1,obs_domain_id=123, obs_point_id=456,sampling_port=3')',output:3' Note: The in-port is container port. The output port and sampling_port are both open flow port and the output port is a GRE tunnel port. 6) Ping from the container whose host enabled flow-based IPFIX. 7) Get the IPFIX template pakcets and IPFIX information packets. Signed-off-by: Benli Ye <daniely@vmware.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2016-06-14 16:53:34 +08:00
/* If flow IPFIX is enabled, make sure IPFIX flow sample action
* at egress point of tunnel port is just in front of corresponding
* output action. If bridge IPFIX is enabled, this appends an IPFIX
* sample action to 'ctx->odp_actions'. */
static void
compose_ipfix_action(struct xlate_ctx *ctx, odp_port_t output_odp_port)
{
struct dpif_ipfix *ipfix = ctx->xbridge->ipfix;
odp_port_t tunnel_out_port = ODPP_NONE;
if (!ipfix ||
(output_odp_port == ODPP_NONE &&
ctx->xin->flow.in_port.ofp_port == OFPP_NONE)) {
return;
}
/* For input case, output_odp_port is ODPP_NONE, which is an invalid port
* number. */
if (output_odp_port == ODPP_NONE &&
!dpif_ipfix_get_bridge_exporter_input_sampling(ipfix)) {
return;
}
ipfix: Support tunnel information for Flow IPFIX. Add support to export tunnel information for flow-based IPFIX. The original steps to configure flow level IPFIX: 1) Create a new record in Flow_Sample_Collector_Set table: 'ovs-vsctl -- create Flow_Sample_Collector_Set id=1 bridge="Bridge UUID"' 2) Add IPFIX configuration which is referred by corresponding row in Flow_Sample_Collector_Set table: 'ovs-vsctl -- set Flow_Sample_Collector_Set "Flow_Sample_Collector_Set UUID" ipfix=@i -- --id=@i create IPFIX targets=\"IP:4739\" obs_domain_id=123 obs_point_id=456 cache_active_timeout=60 cache_max_flows=13' 3) Add sample action to the flows: 'ovs-ofctl add-flow mybridge in_port=1, actions=sample'('probability=65535,collector_set_id=1, obs_domain_id=123,obs_point_id=456')',output:3' NXAST_SAMPLE action was used in step 3. In order to support exporting tunnel information, the NXAST_SAMPLE2 action was added and with NXAST_SAMPLE2 action in this patch, the step 3 should be configured like below: 'ovs-ofctl add-flow mybridge in_port=1, actions=sample'('probability=65535,collector_set_id=1,obs_domain_id=123, obs_point_id=456,sampling_port=3')',output:3' 'sampling_port' can be equal to ingress port or one of egress ports. If sampling port is equal to output port and the output port is a tunnel port, OVS_USERSPACE_ATTR_EGRESS_TUN_PORT will be set in the datapath flow sample action. When flow sample action upcall happens, tunnel information will be retrieved from the datapath and then IPFIX can export egress tunnel port information. If samping_port=65535 (OFPP_NONE), flow-based IPFIX will keep the same behavior as before. This patch mainly do three tasks: 1) Add a new flow sample action NXAST_SAMPLE2 to support exporting tunnel information. NXAST_SAMPLE2 action has a new added field 'sampling_port'. 2) Use 'other_configure: enable-tunnel-sampling' to enable or disable exporting tunnel information. 
3) If 'sampling_port' is equal to output port and output port is a tunnel port, the translation of OpenFlow "sample" action should first emit set(tunnel(...)), then the sample action itself. It makes sure the egress tunnel information can be sampled. 4) Add a test of flow-based IPFIX for tunnel set. How to test flow-based IPFIX: 1) Setup a test environment with two Linux host with Docker supported 2) Create a Docker container and a GRE tunnel port on each host 3) Use ovs-docker to add the container on the bridge 4) Listen on port 4739 on the collector machine and use wireshark to filter 'cflow' packets. 5) Configure flow-based IPFIX: - 'ovs-vsctl -- create Flow_Sample_Collector_Set id=1 bridge="Bridge UUID"' - 'ovs-vsctl -- set Flow_Sample_Collector_Set "Flow_Sample_Collector_Set UUID" ipfix=@i -- --id=@i create IPFIX \ targets=\"IP:4739\" cache_active_timeout=60 cache_max_flows=13 \ other_config:enable-tunnel-sampling=true' - 'ovs-ofctl add-flow mybridge in_port=1, actions=sample'('probability=65535,collector_set_id=1,obs_domain_id=123, obs_point_id=456,sampling_port=3')',output:3' Note: The in-port is container port. The output port and sampling_port are both open flow port and the output port is a GRE tunnel port. 6) Ping from the container whose host enabled flow-based IPFIX. 7) Get the IPFIX template pakcets and IPFIX information packets. Signed-off-by: Benli Ye <daniely@vmware.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2016-06-14 16:53:34 +08:00
/* For output case, output_odp_port is valid. */
if (output_odp_port != ODPP_NONE) {
if (!dpif_ipfix_get_bridge_exporter_output_sampling(ipfix)) {
return;
}
/* If tunnel sampling is enabled, put an additional option attribute:
* OVS_USERSPACE_ATTR_TUNNEL_OUT_PORT
*/
if (dpif_ipfix_get_bridge_exporter_tunnel_sampling(ipfix) &&
dpif_ipfix_is_tunnel_port(ipfix, output_odp_port) ) {
tunnel_out_port = output_odp_port;
}
}
ofproto-dpif: Fix using uninitialised memory in user_action_cookie. Designated initializers are not suitable for initializing non-packed structures and unions which are subjects for comparison by memcmp(). Whole memory for 'struct user_action_cookie' must be explicitly cleared before using because it will be copied with memcpy and later compared by memcmp in ofpbuf_equal(). Few issues found be valgrind: Thread 13 revalidator11: Conditional jump or move depends on uninitialised value(s) at 0x4C35D96: __memcmp_sse4_1 (in vgpreload_memcheck.so) by 0x9D4404: ofpbuf_equal (ofpbuf.h:273) by 0x9D4404: revalidate_ukey__ (ofproto-dpif-upcall.c:2219) by 0x9D4404: revalidate_ukey (ofproto-dpif-upcall.c:2286) by 0x9D62AC: revalidate (ofproto-dpif-upcall.c:2685) by 0x9D62AC: udpif_revalidator (ofproto-dpif-upcall.c:942) by 0xA9C732: ovsthread_wrapper (ovs-thread.c:383) by 0x5FF86DA: start_thread (pthread_create.c:463) by 0x6AF488E: clone (clone.S:95) Uninitialised value was created by a stack allocation at 0x9D4450: compose_slow_path (ofproto-dpif-upcall.c:1062) Thread 11 revalidator16: Conditional jump or move depends on uninitialised value(s) at 0x4C35D96: __memcmp_sse4_1 (in vgpreload_memcheck.so) by 0x9D4404: ofpbuf_equal (ofpbuf.h:273) by 0x9D4404: revalidate_ukey__ (ofproto-dpif-upcall.c:2220) by 0x9D4404: revalidate_ukey (ofproto-dpif-upcall.c:2287) by 0x9D62BC: revalidate (ofproto-dpif-upcall.c:2686) by 0x9D62BC: udpif_revalidator (ofproto-dpif-upcall.c:942) by 0xA9C6D2: ovsthread_wrapper (ovs-thread.c:383) by 0x5FF86DA: start_thread (pthread_create.c:463) by 0x6AF488E: clone (clone.S:95) Uninitialised value was created by a stack allocation at 0x9DC4E0: compose_sflow_action (ofproto-dpif-xlate.c:3211) The struct was never marked as 'packed', however it was manually adjusted to be so in practice. Old IPFIX related commit first made the structure non-contiguous. 
Commit 8de6ff3ea864 ("ofproto-dpif: Use a fixed size userspace cookie.") added uninitialized parts of the additional union space and the next one introduced new holes between structure fields for all cases. CC: Justin Pettit <jpettit@ovn.org> Fixes: 8b7ea2d48033 ("Extend OVS IPFIX exporter to export tunnel headers") Fixes: 8de6ff3ea864 ("ofproto-dpif: Use a fixed size userspace cookie.") Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user action cookie.") Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Ben Pfaff <blp@ovn.org>
2019-07-25 18:11:13 +03:00
struct user_action_cookie cookie;
memset(&cookie, 0, sizeof cookie);
cookie.type = USER_ACTION_COOKIE_IPFIX;
cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port;
cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid;
cookie.ipfix.output_odp_port = output_odp_port;
compose_sample_action(ctx,
dpif_ipfix_get_bridge_exporter_probability(ipfix),
&cookie, tunnel_out_port, false);
}
/* Fix "sample" action according to data collected while composing ODP actions,
* as described in compose_sflow_action().
*
* 'user_cookie_offset' must be the offset returned by
* compose_sflow_action(). */
static void
fix_sflow_action(struct xlate_ctx *ctx, unsigned int user_cookie_offset)
{
    struct user_action_cookie *cookie;
    uint32_t ifindex;

    cookie = ofpbuf_at(ctx->odp_actions, user_cookie_offset, sizeof *cookie);
    ovs_assert(cookie->type == USER_ACTION_COOKIE_SFLOW);

    /* Record the VLAN TCI the packet carried when it entered the pipeline. */
    cookie->sflow.vlan_tci = ctx->base_flow.vlans[0].tci;

    /* See http://www.sflow.org/sflow_version_5.txt (search for "Input/output
     * port information") for the interpretation of cookie->output. */
    if (!ctx->sflow_n_outputs) {
        /* 0x40000000 | 256 means "packet dropped for unknown reason". */
        cookie->sflow.output = 0x40000000 | 256;
        return;
    }

    /* With exactly one output port, try to report its ifindex directly. */
    ifindex = ctx->sflow_n_outputs == 1
              ? dpif_sflow_odp_port_to_ifindex(ctx->xbridge->sflow,
                                               ctx->sflow_odp_port)
              : 0;
    if (ifindex) {
        cookie->sflow.output = ifindex;
    } else {
        /* 0x80000000 means "multiple output ports". */
        cookie->sflow.output = 0x80000000 | ctx->sflow_n_outputs;
    }
}
/* Checks whether 'flow' arriving on 'xport' belongs to one of the control
 * protocols that OVS handles itself (CFM, BFD, LACP, STP/RSTP, LLDP) instead
 * of forwarding through the OpenFlow pipeline.
 *
 * If 'ctx->xin->packet' is nonnull, also hands that packet to the matching
 * protocol implementation as a side effect (heartbeat/state processing).
 *
 * Returns true and ORs the corresponding slow-path reason into
 * 'ctx->xout->slow' if the flow needs special handling; returns false
 * otherwise. */
static bool
process_special(struct xlate_ctx *ctx, const struct xport *xport)
{
    const struct flow *flow = &ctx->xin->flow;
    struct flow_wildcards *wc = ctx->wc;
    const struct xbridge *xbridge = ctx->xbridge;
    const struct dp_packet *packet = ctx->xin->packet;
    enum slow_path_reason slow;
    bool lacp_may_enable;
    /* The checks below are ordered: the first protocol whose "should process"
     * predicate claims the flow wins. */
    if (!xport) {
        slow = 0;
    } else if (xport->cfm && cfm_should_process_flow(xport->cfm, flow, wc)) {
        if (packet) {
            cfm_process_heartbeat(xport->cfm, packet);
        }
        slow = SLOW_CFM;
    } else if (xport->bfd && bfd_should_process_flow(xport->bfd, flow, wc)) {
        if (packet) {
            bfd_process_packet(xport->bfd, flow, packet);
            /* If POLL received, immediately sends FINAL back. */
            if (bfd_should_send_packet(xport->bfd)) {
                ofproto_dpif_monitor_port_send_soon(xport->ofport);
            }
        }
        slow = SLOW_BFD;
    } else if (xport->xbundle && xport->xbundle->lacp
               && flow->dl_type == htons(ETH_TYPE_LACP)) {
        if (packet) {
            lacp_may_enable = lacp_process_packet(xport->xbundle->lacp,
                                                  xport->ofport, packet);
            /* Update LACP status in bond-member to avoid packet-drops
             * until LACP state machine is run by the main thread. */
            if (xport->xbundle->bond && lacp_may_enable) {
                bond_member_set_may_enable(xport->xbundle->bond, xport->ofport,
                                           lacp_may_enable);
            }
        }
        slow = SLOW_LACP;
    } else if ((xbridge->stp || xbridge->rstp) &&
               stp_should_process_flow(flow, wc)) {
        if (packet) {
            /* A bridge runs either classic STP or RSTP, never both. */
            xbridge->stp
                ? stp_process_packet(xport, packet)
                : rstp_process_packet(xport, packet);
        }
        slow = SLOW_STP;
    } else if (xport->lldp && lldp_should_process_flow(xport->lldp, flow)) {
        if (packet) {
            lldp_process_packet(xport->lldp, packet);
        }
        slow = SLOW_LLDP;
    } else {
        slow = 0;
    }
    if (slow) {
        ctx->xout->slow |= slow;
        return true;
    } else {
        return false;
    }
}
/* Looks up the route toward the tunnel destination carried in 'oflow'.
 *
 * On success, returns 0 and sets '*ip' to the next hop (the configured
 * gateway when one is set, otherwise the tunnel destination itself), '*src'
 * to the source address chosen by the router, and '*out_port' to an xport
 * whose netdev name matches the route's output device.  Returns -ENOENT when
 * no route exists or no matching port can be found. */
static int
tnl_route_lookup_flow(const struct xlate_ctx *ctx,
                      const struct flow *oflow,
                      struct in6_addr *ip, struct in6_addr *src,
                      struct xport **out_port)
{
    char out_dev[IFNAMSIZ];
    struct in6_addr dst = flow_tnl_dst(&oflow->tunnel);
    struct in6_addr gw;
    struct xbridge *xb;

    if (!ovs_router_lookup(oflow->pkt_mark, &dst, out_dev, src, &gw)) {
        return -ENOENT;
    }

    /* Use the gateway as next hop when it is set (and, for an IPv4-mapped
     * address, nonzero); otherwise go directly to the destination. */
    if (ipv6_addr_is_set(&gw) &&
        (!IN6_IS_ADDR_V4MAPPED(&gw) || in6_addr_get_mapped_ipv4(&gw))) {
        *ip = gw;
    } else {
        *ip = dst;
    }

    /* Prefer a port on the bridge whose name matches the output device. */
    HMAP_FOR_EACH (xb, hmap_node, &ctx->xcfg->xbridges) {
        if (strncmp(xb->name, out_dev, IFNAMSIZ)) {
            continue;
        }
        struct xport *candidate;
        HMAP_FOR_EACH (candidate, ofp_node, &xb->xports) {
            if (!strncmp(netdev_get_name(candidate->netdev),
                         out_dev, IFNAMSIZ)) {
                *out_port = candidate;
                return 0;
            }
        }
    }

    /* If tunnel IP isn't configured on bridges, then we search all ports. */
    HMAP_FOR_EACH (xb, hmap_node, &ctx->xcfg->xbridges) {
        struct xport *candidate;
        HMAP_FOR_EACH (candidate, ofp_node, &xb->xports) {
            if (!strncmp(netdev_get_name(candidate->netdev),
                         out_dev, IFNAMSIZ)) {
                *out_port = candidate;
                return 0;
            }
        }
    }
    return -ENOENT;
}
/* Re-injects 'packet' into the OpenFlow pipeline of 'out_dev''s bridge via a
 * single "output:TABLE" action, as if it had been received on 'out_dev'.
 * Used to transmit the ARP/ND requests generated for native tunneling.
 * Returns 0 if successful, otherwise a positive errno value. */
static int
compose_table_xlate(struct xlate_ctx *ctx, const struct xport *out_dev,
                    struct dp_packet *packet)
{
    struct xbridge *xbridge = out_dev->xbridge;
    ovs_version_t version = ofproto_dpif_get_tables_version(xbridge->ofproto);
    struct ofpact_output output;
    struct flow flow;

    /* Build the lone "output:TABLE" action. */
    ofpact_init(&output.ofpact, OFPACT_OUTPUT, sizeof output);
    output.port = OFPP_TABLE;
    output.max_len = 0;

    /* Make the packet look as if it arrived on 'out_dev'. */
    flow_extract(packet, &flow);
    flow.in_port.ofp_port = out_dev->ofp_port;

    return ofproto_dpif_execute_actions__(xbridge->ofproto, version, &flow,
                                          NULL, &output.ofpact, sizeof output,
                                          ctx->depth, ctx->resubmits, packet);
}
static void
tnl_send_nd_request(struct xlate_ctx *ctx, const struct xport *out_dev,
const struct eth_addr eth_src,
struct in6_addr * ipv6_src, struct in6_addr * ipv6_dst)
{
struct dp_packet packet;
dp_packet_init(&packet, 0);
compose_nd_ns(&packet, eth_src, ipv6_src, ipv6_dst);
compose_table_xlate(ctx, out_dev, &packet);
dp_packet_uninit(&packet);
}
/* Composes an ARP request for 'ip_dst' from 'eth_src' / 'ip_src' and injects
 * it into the pipeline of 'out_dev''s bridge. */
static void
tnl_send_arp_request(struct xlate_ctx *ctx, const struct xport *out_dev,
                     const struct eth_addr eth_src,
                     ovs_be32 ip_src, ovs_be32 ip_dst)
{
    struct dp_packet arp_pkt;

    dp_packet_init(&arp_pkt, 0);
    compose_arp(&arp_pkt, ARP_OP_REQUEST,
                eth_src, eth_addr_zero, true, ip_src, ip_dst);
    compose_table_xlate(ctx, out_dev, &arp_pkt);
    dp_packet_uninit(&arp_pkt);
}
tunneling: Avoid datapath-recirc by combining recirc actions at xlate. This patch set removes the recirculation of encapsulated tunnel packets if possible. It is done by computing the post tunnel actions at the time of translation. The combined nested action set are programmed in the datapath using CLONE action. The following test results shows the performance improvement offered by this optimization for tunnel encap. +-------------+ dpdk0 | | -->o br-in | | o--> gre0 +-------------+ --> LOCAL +-----------o-+ | | dpdk1 | br-p1 o--> | | +-------------+ Test result on OVS master with DPDK 16.11.2 (Without optimization): # dpdk0 RX packets : 7037641.60 / sec RX packet errors : 0 / sec RX packets dropped : 7730632.90 / sec RX rate : 402.69 MB/sec # dpdk1 TX packets : 7037641.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 657.73 MB/sec TX processing cost per TX packets in nsec : 142.09 Test result on OVS master + DPDK 16.11.2 (With optimization): # dpdk0 RX packets : 9386809.60 / sec RX packet errors : 0 / sec RX packets dropped : 5381496.40 / sec RX rate : 537.11 MB/sec # dpdk1 TX packets : 9386809.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 877.29 MB/sec TX processing cost per TX packets in nsec : 106.53 The offered performance gain is approx 30%. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Joe Stringer <joe@ovn.org>
2017-07-19 14:46:03 +01:00
/* Rewrites the Ethernet/L3/L4 fields of 'dst_flow' from the tunnel metadata
 * in 'src_flow', so that 'dst_flow' describes the encapsulating (outer)
 * packet.  'dst_flow' and 'src_flow' may refer to the same flow.
 *
 * 'dmac'/'smac' become the outer Ethernet addresses.  's_ip6' (IPv6) or
 * 's_ip' (IPv4) supplies a fallback source address when the tunnel metadata
 * leaves it unset.  'is_tnl_ipv6' selects the outer address family and
 * 'nw_proto' the outer IP protocol. */
static void
propagate_tunnel_data_to_flow__(struct flow *dst_flow,
                                const struct flow *src_flow,
                                struct eth_addr dmac, struct eth_addr smac,
                                struct in6_addr s_ip6, ovs_be32 s_ip,
                                bool is_tnl_ipv6, uint8_t nw_proto)
{
    dst_flow->dl_dst = dmac;
    dst_flow->dl_src = smac;

    /* Clear VLAN entries which do not apply for tunnel flows. */
    memset(dst_flow->vlans, 0, sizeof dst_flow->vlans);

    dst_flow->packet_type = htonl(PT_ETH);

    /* Outer L3/L4 fields come straight from the tunnel metadata. */
    dst_flow->nw_dst = src_flow->tunnel.ip_dst;
    dst_flow->nw_src = src_flow->tunnel.ip_src;
    dst_flow->ipv6_dst = src_flow->tunnel.ipv6_dst;
    dst_flow->ipv6_src = src_flow->tunnel.ipv6_src;
    dst_flow->nw_frag = 0;      /* Tunnel packets are unfragmented. */
    dst_flow->nw_tos = src_flow->tunnel.ip_tos;
    dst_flow->nw_ttl = src_flow->tunnel.ip_ttl;
    dst_flow->tp_dst = src_flow->tunnel.tp_dst;
    dst_flow->tp_src = src_flow->tunnel.tp_src;

    if (is_tnl_ipv6) {
        dst_flow->dl_type = htons(ETH_TYPE_IPV6);
        /* Fall back to the route's source address if the tunnel metadata
         * did not provide one. */
        if (ipv6_mask_is_any(&dst_flow->ipv6_src)
            && !ipv6_mask_is_any(&s_ip6)) {
            dst_flow->ipv6_src = s_ip6;
        }
    } else {
        dst_flow->dl_type = htons(ETH_TYPE_IP);
        if (!dst_flow->nw_src && s_ip) {
            dst_flow->nw_src = s_ip;
        }
    }
    dst_flow->nw_proto = nw_proto;
}
/*
* Populate the 'flow' and 'base_flow' L3 fields to do the post tunnel push
* translations.
*/
static void
propagate_tunnel_data_to_flow(struct xlate_ctx *ctx, struct eth_addr dmac,
                              struct eth_addr smac, struct in6_addr s_ip6,
                              ovs_be32 s_ip, bool is_tnl_ipv6,
                              enum ovs_vport_type tnl_type)
{
    struct flow *flow = &ctx->xin->flow;
    struct flow *base_flow = &ctx->base_flow;
    uint8_t nw_proto;

    /* Derive the outer IP protocol from the tunnel port type. */
    switch (tnl_type) {
    case OVS_VPORT_TYPE_GRE:
    case OVS_VPORT_TYPE_ERSPAN:
    case OVS_VPORT_TYPE_IP6ERSPAN:
    case OVS_VPORT_TYPE_IP6GRE:
        nw_proto = IPPROTO_GRE;
        break;
    case OVS_VPORT_TYPE_VXLAN:
    case OVS_VPORT_TYPE_GENEVE:
    case OVS_VPORT_TYPE_GTPU:
    case OVS_VPORT_TYPE_BAREUDP:
        nw_proto = IPPROTO_UDP;
        break;
    case OVS_VPORT_TYPE_SRV6:
        /* SRv6 carries the inner packet directly inside IP. */
        nw_proto = (flow->dl_type == htons(ETH_TYPE_IP))
                   ? IPPROTO_IPIP : IPPROTO_IPV6;
        break;
    case OVS_VPORT_TYPE_LISP:
    case OVS_VPORT_TYPE_STT:
    case OVS_VPORT_TYPE_UNSPEC:
    case OVS_VPORT_TYPE_NETDEV:
    case OVS_VPORT_TYPE_INTERNAL:
    case __OVS_VPORT_TYPE_MAX:
    default:
        OVS_NOT_REACHED();
    }

    /* Update 'base_flow' first and 'flow' last: the helper modifies its
     * destination, and the second call passes 'flow' as both source and
     * destination. */
    propagate_tunnel_data_to_flow__(base_flow, flow, dmac, smac, s_ip6, s_ip,
                                    is_tnl_ipv6, nw_proto);
    propagate_tunnel_data_to_flow__(flow, flow, dmac, smac, s_ip6, s_ip,
                                    is_tnl_ipv6, nw_proto);
}
static int
native_tunnel_output(struct xlate_ctx *ctx, const struct xport *xport,
const struct flow *flow, odp_port_t tunnel_odp_port,
bool truncate, bool is_last_action)
{
struct netdev_tnl_build_header_params tnl_params;
struct ovs_action_push_tnl tnl_push_data;
struct xport *out_dev = NULL;
ovs_be32 s_ip = 0, d_ip = 0;
struct in6_addr s_ip6 = in6addr_any;
struct in6_addr d_ip6 = in6addr_any;
struct eth_addr smac;
struct eth_addr dmac;
int err;
char buf_sip6[INET6_ADDRSTRLEN];
char buf_dip6[INET6_ADDRSTRLEN];
/* Store sFlow data. */
uint32_t sflow_n_outputs = ctx->sflow_n_outputs;
tunneling: Avoid datapath-recirc by combining recirc actions at xlate. This patch set removes the recirculation of encapsulated tunnel packets if possible. It is done by computing the post tunnel actions at the time of translation. The combined nested action set are programmed in the datapath using CLONE action. The following test results shows the performance improvement offered by this optimization for tunnel encap. +-------------+ dpdk0 | | -->o br-in | | o--> gre0 +-------------+ --> LOCAL +-----------o-+ | | dpdk1 | br-p1 o--> | | +-------------+ Test result on OVS master with DPDK 16.11.2 (Without optimization): # dpdk0 RX packets : 7037641.60 / sec RX packet errors : 0 / sec RX packets dropped : 7730632.90 / sec RX rate : 402.69 MB/sec # dpdk1 TX packets : 7037641.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 657.73 MB/sec TX processing cost per TX packets in nsec : 142.09 Test result on OVS master + DPDK 16.11.2 (With optimization): # dpdk0 RX packets : 9386809.60 / sec RX packet errors : 0 / sec RX packets dropped : 5381496.40 / sec RX rate : 537.11 MB/sec # dpdk1 TX packets : 9386809.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 877.29 MB/sec TX processing cost per TX packets in nsec : 106.53 The offered performance gain is approx 30%. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Joe Stringer <joe@ovn.org>
2017-07-19 14:46:03 +01:00
/* Structures to backup Ethernet and IP of base_flow. */
struct flow old_base_flow;
struct flow old_flow;
/* Backup flow & base_flow data. */
memcpy(&old_base_flow, &ctx->base_flow, sizeof old_base_flow);
memcpy(&old_flow, &ctx->xin->flow, sizeof old_flow);
if (flow->tunnel.ip_src) {
in6_addr_set_mapped_ipv4(&s_ip6, flow->tunnel.ip_src);
}
tunneling: Avoid datapath-recirc by combining recirc actions at xlate. This patch set removes the recirculation of encapsulated tunnel packets if possible. It is done by computing the post tunnel actions at the time of translation. The combined nested action set are programmed in the datapath using CLONE action. The following test results shows the performance improvement offered by this optimization for tunnel encap. +-------------+ dpdk0 | | -->o br-in | | o--> gre0 +-------------+ --> LOCAL +-----------o-+ | | dpdk1 | br-p1 o--> | | +-------------+ Test result on OVS master with DPDK 16.11.2 (Without optimization): # dpdk0 RX packets : 7037641.60 / sec RX packet errors : 0 / sec RX packets dropped : 7730632.90 / sec RX rate : 402.69 MB/sec # dpdk1 TX packets : 7037641.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 657.73 MB/sec TX processing cost per TX packets in nsec : 142.09 Test result on OVS master + DPDK 16.11.2 (With optimization): # dpdk0 RX packets : 9386809.60 / sec RX packet errors : 0 / sec RX packets dropped : 5381496.40 / sec RX rate : 537.11 MB/sec # dpdk1 TX packets : 9386809.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 877.29 MB/sec TX processing cost per TX packets in nsec : 106.53 The offered performance gain is approx 30%. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Joe Stringer <joe@ovn.org>
2017-07-19 14:46:03 +01:00
err = tnl_route_lookup_flow(ctx, flow, &d_ip6, &s_ip6, &out_dev);
if (err) {
xlate_report(ctx, OFT_WARN, "native tunnel routing failed");
return err;
}
xlate_report(ctx, OFT_DETAIL, "tunneling to %s via %s",
ipv6_string_mapped(buf_dip6, &d_ip6),
netdev_get_name(out_dev->netdev));
/* Use mac addr of bridge port of the peer. */
err = netdev_get_etheraddr(out_dev->netdev, &smac);
if (err) {
xlate_report(ctx, OFT_WARN,
"tunnel output device lacks Ethernet address");
return err;
}
d_ip = in6_addr_get_mapped_ipv4(&d_ip6);
if (d_ip) {
s_ip = in6_addr_get_mapped_ipv4(&s_ip6);
}
err = tnl_neigh_lookup(out_dev->xbridge->name, &d_ip6, &dmac);
if (err) {
struct in6_addr nh_s_ip6 = in6addr_any;
xlate_report(ctx, OFT_DETAIL,
"neighbor cache miss for %s on bridge %s, "
"sending %s request",
buf_dip6, out_dev->xbridge->name, d_ip ? "ARP" : "ND");
err = ovs_router_get_netdev_source_address(&d_ip6,
out_dev->xbridge->name,
&nh_s_ip6);
if (err) {
nh_s_ip6 = s_ip6;
}
if (d_ip) {
ovs_be32 nh_s_ip;
nh_s_ip = in6_addr_get_mapped_ipv4(&nh_s_ip6);
tnl_send_arp_request(ctx, out_dev, smac, nh_s_ip, d_ip);
} else {
tnl_send_nd_request(ctx, out_dev, smac, &nh_s_ip6, &d_ip6);
}
return err;
}
if (ctx->xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_TNL_NEIGH);
ovs_strlcpy(entry->tnl_neigh_cache.br_name, out_dev->xbridge->name,
sizeof entry->tnl_neigh_cache.br_name);
entry->tnl_neigh_cache.d_ipv6 = d_ip6;
}
xlate_report(ctx, OFT_DETAIL, "tunneling from "ETH_ADDR_FMT" %s"
" to "ETH_ADDR_FMT" %s",
ETH_ADDR_ARGS(smac), ipv6_string_mapped(buf_sip6, &s_ip6),
ETH_ADDR_ARGS(dmac), buf_dip6);
netdev_init_tnl_build_header_params(&tnl_params, flow, &s_ip6, dmac, smac);
err = tnl_port_build_header(xport->ofport, &tnl_push_data, &tnl_params);
if (err) {
xlate_report(ctx, OFT_WARN, "native tunnel header build failed");
return err;
}
tnl_push_data.tnl_port = tunnel_odp_port;
tnl_push_data.out_port = out_dev->odp_port;
userspace: Switching of L3 packets in L2 pipeline Ports have a new layer3 attribute if they send/receive L3 packets. The packet_type included in structs dp_packet and flow is considered in ofproto-dpif. The classical L2 match fields (dl_src, dl_dst, dl_type, and vlan_tci, vlan_vid, vlan_pcp) now have Ethernet as pre-requisite. A dummy ethernet header is pushed to L3 packets received from L3 ports before the the pipeline processing starts. The ethernet header is popped before sending a packet to a L3 port. For datapath ports that can receive L2 or L3 packets, the packet_type becomes part of the flow key for datapath flows and is handled appropriately in dpif-netdev. In the 'else' branch in flow_put_on_pmd() function, the additional check flow_equal(&match.flow, &netdev_flow->flow) was removed, as a) the dpcls lookup is sufficient to uniquely identify a flow and b) it caused false negatives because the flow in netdev->flow may not properly masked. In dpif_netdev_flow_put() we now use the same method for constructing the netdev_flow_key as the one used when adding the flow to the dplcs to make sure these always match. The function netdev_flow_key_from_flow() used so far was not only inefficient but sometimes caused mismatches and subsequent flow update failures. The kernel datapath does not support the packet_type match field. Instead it encodes the packet type implictly by the presence or absence of the Ethernet attribute in the flow key and mask. This patch filters the PACKET_TYPE attribute out of netlink flow key and mask to be sent to the kernel datapath. Signed-off-by: Lorand Jakab <lojakab@cisco.com> Signed-off-by: Simon Horman <simon.horman@netronome.com> Signed-off-by: Jiri Benc <jbenc@redhat.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-06-02 16:16:17 +00:00
tunneling: Avoid datapath-recirc by combining recirc actions at xlate. This patch set removes the recirculation of encapsulated tunnel packets if possible. It is done by computing the post tunnel actions at the time of translation. The combined nested action set are programmed in the datapath using CLONE action. The following test results shows the performance improvement offered by this optimization for tunnel encap. +-------------+ dpdk0 | | -->o br-in | | o--> gre0 +-------------+ --> LOCAL +-----------o-+ | | dpdk1 | br-p1 o--> | | +-------------+ Test result on OVS master with DPDK 16.11.2 (Without optimization): # dpdk0 RX packets : 7037641.60 / sec RX packet errors : 0 / sec RX packets dropped : 7730632.90 / sec RX rate : 402.69 MB/sec # dpdk1 TX packets : 7037641.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 657.73 MB/sec TX processing cost per TX packets in nsec : 142.09 Test result on OVS master + DPDK 16.11.2 (With optimization): # dpdk0 RX packets : 9386809.60 / sec RX packet errors : 0 / sec RX packets dropped : 5381496.40 / sec RX rate : 537.11 MB/sec # dpdk1 TX packets : 9386809.60 / sec TX packet errors : 0 / sec TX packets dropped : 0 / sec TX rate : 877.29 MB/sec TX processing cost per TX packets in nsec : 106.53 The offered performance gain is approx 30%. Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com> Signed-off-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltán Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Joe Stringer <joe@ovn.org>
2017-07-19 14:46:03 +01:00
/* After tunnel header has been added, MAC and IP data of flow and
* base_flow need to be set properly, since there is not recirculation
* any more when sending packet to tunnel. */
propagate_tunnel_data_to_flow(ctx, dmac, smac, s_ip6,
s_ip, tnl_params.is_ipv6,
tnl_push_data.tnl_type);
size_t offset;
size_t push_action_size;
offset = is_last_action
? ctx->odp_actions->size
: nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_CLONE);
odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
push_action_size = ctx->odp_actions->size;
if (!truncate) {
const struct dpif_flow_stats *backup_resubmit_stats;
struct xlate_cache *backup_xcache;
struct flow_wildcards *backup_wc, wc;
bool backup_side_effects;
const struct dp_packet *backup_packet;
memset(&wc, 0 , sizeof wc);
backup_wc = ctx->wc;
ctx->wc = &wc;
ctx->xin->wc = NULL;
backup_resubmit_stats = ctx->xin->resubmit_stats;
backup_xcache = ctx->xin->xcache;
backup_side_effects = ctx->xin->allow_side_effects;
backup_packet = ctx->xin->packet;
ctx->xin->resubmit_stats = NULL;
ctx->xin->xcache = xlate_cache_new(); /* Use new temporary cache. */
ctx->xin->allow_side_effects = false;
ctx->xin->packet = NULL;
/* Push the cache entry for the tunnel first. */
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_TUNNEL_HEADER);
entry->tunnel_hdr.hdr_size = tnl_push_data.header_len;
entry->tunnel_hdr.operation = ADD;
patch_port_output(ctx, xport, out_dev, is_last_action);
/* Similar to the stats update in revalidation, the x_cache entries
* are populated by the previous translation are used to update the
* stats correctly.
*/
if (backup_resubmit_stats) {
struct dpif_flow_stats stats = *backup_resubmit_stats;
xlate_push_stats(ctx->xin->xcache, &stats, false);
}
xlate_cache_steal_entries(backup_xcache, ctx->xin->xcache);
if (ctx->odp_actions->size > push_action_size) {
if (!is_last_action) {
nl_msg_end_non_empty_nested(ctx->odp_actions, offset);
}
} else {
if (is_last_action) {
/* Reset size since no actions added in patch port output. */
nl_msg_reset_size(ctx->odp_actions, offset);
} else {
/* Cancel nested clone action. */
nl_msg_cancel_nested(ctx->odp_actions, offset);
}
}
/* Restore context status. */
ctx->xin->resubmit_stats = backup_resubmit_stats;
xlate_cache_delete(ctx->xin->xcache);
ctx->xin->xcache = backup_xcache;
ctx->xin->allow_side_effects = backup_side_effects;
ctx->xin->packet = backup_packet;
ctx->wc = backup_wc;
} else {
/* In order to maintain accurate stats, use recirc for
* native tunneling. */
nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, 0);
if (!is_last_action) {
nl_msg_end_nested(ctx->odp_actions, offset);
}
}
/* Restore the flows after the translation. */
memcpy(&ctx->xin->flow, &old_flow, sizeof ctx->xin->flow);
memcpy(&ctx->base_flow, &old_base_flow, sizeof ctx->base_flow);
/* Restore sFlow data. */
ctx->sflow_n_outputs = sflow_n_outputs;
return 0;
}
static void
xlate_commit_actions(struct xlate_ctx *ctx)
{
bool use_masked = ctx->xbridge->support.masked_set_action;
ctx->xout->slow |= commit_odp_actions(&ctx->xin->flow, &ctx->base_flow,
ctx->odp_actions, ctx->wc,
use_masked, ctx->pending_encap,
ctx->pending_decap, ctx->encap_data);
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
ctx->pending_encap = false;
ctx->pending_decap = false;
ofpbuf_delete(ctx->encap_data);
ctx->encap_data = NULL;
}
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
static void
clear_conntrack(struct xlate_ctx *ctx)
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
{
ctx->conntracked = false;
flow_clear_conntrack(&ctx->xin->flow);
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
}
/* Returns true if 'flow' entered the bridge on a port whose bundle is
 * "protected" and 'xport_out' also belongs to a "protected" bundle, i.e.
 * forwarding between the two ports should be blocked.  Returns false if
 * either end is missing, has no bundle, or is not protected. */
static bool
xlate_flow_is_protected(const struct xlate_ctx *ctx, const struct flow *flow,
                        const struct xport *xport_out)
{
    const struct xport *xport_in;

    if (!xport_out || !xport_out->xbundle) {
        return false;
    }

    xport_in = get_ofp_port(ctx->xbridge, flow->in_port.ofp_port);
    return xport_in && xport_in->xbundle
           && xport_in->xbundle->protected
           && xport_out->xbundle->protected;
}
/* Function handles when a packet is sent from one bridge to another bridge.
*
* The bridges are internally connected, either with patch ports or with
* tunnel ports.
*
* The output action to another bridge causes translation to continue within
* the next bridge. This process can be recursive; the next bridge can
* output yet to another bridge.
*
* The translated actions from the second bridge onwards are enclosed within
* the clone action, so that any modification to the packet will not be visible
* to the remaining actions of the originating bridge.
*/
static void
patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev,
                  struct xport *out_dev, bool is_last_action)
{
    /* Snapshot the state of the originating bridge that the peer bridge's
     * translation must not be allowed to disturb. */
    bool old_was_mpls = ctx->was_mpls;
    struct flow *flow = &ctx->xin->flow;
    bool old_conntrack = ctx->conntracked;
    struct xretained_state *retained_state;
    struct ovs_list *old_trace = ctx->xin->trace;
    ovs_version_t old_version = ctx->xin->tables_version;

    /* Save the ctx flow/actions/stack and the tunnel wildcard masks; both
     * are restored before returning to the caller. */
    retained_state = xretain_state_save(ctx);
    xretain_tunnel_mask_save(ctx, retained_state);

    /* Re-initialize per-bridge metadata as if the packet had just been
     * received on 'out_dev': in_port, OF metadata, tunnel data and masks,
     * registers, action-set output port, and conntrack state. */
    flow->in_port.ofp_port = out_dev->ofp_port;
    flow->metadata = htonll(0);
    memset(&flow->tunnel, 0, sizeof flow->tunnel);
    memset(&ctx->wc->masks.tunnel, 0, sizeof ctx->wc->masks.tunnel);
    flow->tunnel.metadata.tab =
        ofproto_get_tun_tab(&out_dev->xbridge->ofproto->up);
    ctx->wc->masks.tunnel.metadata.tab = flow->tunnel.metadata.tab;
    memset(flow->regs, 0, sizeof flow->regs);
    flow->actset_output = OFPP_UNSET;
    clear_conntrack(ctx);
    ctx->xin->trace = xlate_report(ctx, OFT_BRIDGE, "bridge(\"%s\")",
                                   out_dev->xbridge->name);
    mirror_mask_t old_mirrors = ctx->mirrors;
    bool independent_mirrors = out_dev->xbridge != ctx->xbridge;
    if (independent_mirrors) {
        /* Crossing into a different bridge: its mirror set is independent
         * of the mirrors already applied in the calling bridge. */
        ctx->mirrors = 0;
    }
    ctx->xbridge = out_dev->xbridge;
    /* The bridge is now known so obtain its table version. */
    ctx->xin->tables_version
        = ofproto_dpif_get_tables_version(ctx->xbridge->ofproto);
    if (!process_special(ctx, out_dev) && may_receive(out_dev, ctx)) {
        if (xport_stp_forward_state(out_dev) &&
            xport_rstp_forward_state(out_dev)) {
            /* Peer port may forward: run the peer bridge's pipeline from
             * table 0, then its action set, completing any freeze. */
            xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true,
                               false, is_last_action, clone_xlate_actions);
            if (!ctx->freezing) {
                xlate_action_set(ctx);
            }
            if (ctx->freezing) {
                finish_freezing(ctx);
            }
        } else {
            /* Forwarding is disabled by STP and RSTP.  Let OFPP_NORMAL and
             * the learning action look at the packet, then drop it. */
            size_t old_size = ctx->odp_actions->size;
            xretain_base_flow_save(ctx, retained_state);
            mirror_mask_t old_mirrors2 = ctx->mirrors;
            xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true,
                               false, is_last_action, clone_xlate_actions);
            ctx->mirrors = old_mirrors2;
            xretain_base_flow_restore(ctx, retained_state);
            /* Discard any datapath actions emitted by the dry run above. */
            ctx->odp_actions->size = old_size;
            /* Undo changes that may have been done for freezing. */
            ctx_cancel_freeze(ctx);
        }
    }
    ctx->xin->trace = old_trace;
    if (independent_mirrors) {
        ctx->mirrors = old_mirrors;
    }
    ctx->xbridge = in_dev->xbridge;
    /* Restore calling bridge's lookup version. */
    ctx->xin->tables_version = old_version;
    /* Restore to calling bridge tunneling information; the ctx flow, actions,
     * and stack.  And free the retained state. */
    xretain_tunnel_mask_restore(ctx, retained_state);
    xretain_state_restore_and_free(ctx, retained_state);
    /* The out bridge popping MPLS should have no effect on the original
     * bridge. */
    ctx->was_mpls = old_was_mpls;
    /* The out bridge's conntrack execution should have no effect on the
     * original bridge. */
    ctx->conntracked = old_conntrack;
    /* The fact that the out bridge exits (for any reason) does not mean
     * that the original bridge should exit.  Specifically, if the out
     * bridge freezes translation, the original bridge must continue
     * processing with the original, not the frozen packet! */
    ctx->exit = false;
    /* Out bridge errors do not propagate back. */
    ctx->error = XLATE_OK;
    if (ctx->xin->resubmit_stats) {
        /* Account the hop as a TX on the input device and an RX on the
         * peer device. */
        netdev_vport_inc_tx(in_dev->netdev, ctx->xin->resubmit_stats);
        netdev_vport_inc_rx(out_dev->netdev, ctx->xin->resubmit_stats);
        if (out_dev->bfd) {
            bfd_account_rx(out_dev->bfd, ctx->xin->resubmit_stats);
        }
    }
    if (ctx->xin->xcache) {
        /* Record netdev/bfd references in the translation cache as an
         * XC_NETDEV entry. */
        struct xc_entry *entry;
        entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NETDEV);
        entry->dev.tx = netdev_ref(in_dev->netdev);
        entry->dev.rx = netdev_ref(out_dev->netdev);
        entry->dev.bfd = bfd_ref(out_dev->bfd);
    }
}
/* Checks whether output of 'flow' to 'xport' is permitted at all: the port
 * must exist, must not have OFPPC_NO_FWD set, must be a valid target for
 * mirror truncation if truncation is in effect, and must not complete a
 * path between two protected bundles.  When 'check_stp' is true the port's
 * (R)STP state must also allow the packet.  Finally, a non-Ethernet packet
 * cannot be sent through a legacy L2 port.
 *
 * Returns true if output may proceed; otherwise emits a trace report
 * explaining the reason and returns false. */
static bool
check_output_prerequisites(struct xlate_ctx *ctx,
                           const struct xport *xport,
                           struct flow *flow,
                           bool check_stp)
{
    struct flow_wildcards *wc = ctx->wc;

    if (!xport) {
        xlate_report(ctx, OFT_WARN, "Nonexistent output port");
        return false;
    } else if (xport->config & OFPUTIL_PC_NO_FWD) {
        xlate_report(ctx, OFT_DETAIL, "OFPPC_NO_FWD set, skipping output");
        return false;
    } else if (ctx->mirror_snaplen != 0 && xport->odp_port == ODPP_NONE) {
        xlate_report(ctx, OFT_WARN,
                     "Mirror truncate to ODPP_NONE, skipping output");
        return false;
    } else if (xlate_flow_is_protected(ctx, flow, xport)) {
        xlate_report(ctx, OFT_WARN,
                     "Flow is between protected ports, skipping output.");
        return false;
    } else if (check_stp) {
        if (is_stp(&ctx->base_flow)) {
            /* The packet itself is an STP/RSTP BPDU. */
            if (!xport_stp_should_forward_bpdu(xport) &&
                !xport_rstp_should_manage_bpdu(xport)) {
                if (ctx->xbridge->stp != NULL) {
                    xlate_report(ctx, OFT_WARN,
                                 "STP not in listening state, "
                                 "skipping bpdu output");
                } else if (ctx->xbridge->rstp != NULL) {
                    xlate_report(ctx, OFT_WARN,
                                 "RSTP not managing BPDU in this state, "
                                 "skipping bpdu output");
                }
                return false;
            }
        } else if ((xport->cfm && cfm_should_process_flow(xport->cfm, flow, wc))
                   || (xport->bfd && bfd_should_process_flow(xport->bfd, flow,
                                                             wc))) {
            /* Pass; STP should not block link health detection. */
        } else if (!xport_stp_forward_state(xport) ||
                   !xport_rstp_forward_state(xport)) {
            if (ctx->xbridge->stp != NULL) {
                xlate_report(ctx, OFT_WARN,
                             "STP not in forwarding state, skipping output");
            } else if (ctx->xbridge->rstp != NULL) {
                xlate_report(ctx, OFT_WARN,
                             "RSTP not in forwarding state, skipping output");
            }
            return false;
        }
    }

    if (xport->pt_mode == NETDEV_PT_LEGACY_L2 &&
        flow->packet_type != htonl(PT_ETH)) {
        xlate_report(ctx, OFT_WARN, "Trying to send non-Ethernet packet "
                     "through legacy L2 port. Dropping packet.");
        return false;
    }

    return true;
}
/* Function verifies if destination address of received Neighbor Advertisement
 * message stored in 'flow' is correct.  It should be either FF02::1:FFXX:XXXX
 * where XX:XXXX stands for the last 24 bits of 'ipv6_addr' or it should match
 * 'ipv6_addr'. */
static bool
is_nd_dst_correct(const struct flow *flow, const struct in6_addr *ipv6_addr)
{
    const uint8_t *flow_ipv6_addr = (uint8_t *) &flow->ipv6_dst;
    const uint8_t *addr = (uint8_t *) ipv6_addr;

    /* Accept the solicited-node multicast address derived from 'ipv6_addr'
     * (RFC 4291: FF02::1:FF00:0/104 with the low 24 bits of the unicast
     * address appended), or an exact match on 'ipv6_addr' itself. */
    return (IN6_IS_ADDR_MC_LINKLOCAL(&flow->ipv6_dst) &&
            flow_ipv6_addr[11] == 0x01 &&
            flow_ipv6_addr[12] == 0xff &&
            flow_ipv6_addr[13] == addr[13] &&
            flow_ipv6_addr[14] == addr[14] &&
            flow_ipv6_addr[15] == addr[15]) ||
           IN6_ARE_ADDR_EQUAL(&flow->ipv6_dst, ipv6_addr);
}
/* Returns true if 'ip_addr' matches the target of the neighbor reply in
 * 'flow': for an IPv4-mapped address, 'flow' must be an ARP whose 'nw_dst'
 * equals the mapped address; otherwise the IPv6 ND destination check
 * applies. */
static bool
is_neighbor_reply_matched(const struct flow *flow, struct in6_addr *ip_addr)
{
    if (IN6_IS_ADDR_V4MAPPED(ip_addr)) {
        return flow->dl_type == htons(ETH_TYPE_ARP)
               && in6_addr_get_mapped_ipv4(ip_addr) == flow->nw_dst;
    }

    return is_nd_dst_correct(flow, ip_addr);
}
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
/* Function verifies if the ARP reply or Neighbor Advertisement represented by
* 'flow' addresses the 'xbridge' of 'ctx'. Returns true if the ARP TA or
* neighbor discovery destination is in the list of configured IP addresses of
* the bridge. Otherwise, it returns false. */
static bool
is_neighbor_reply_correct(const struct xlate_ctx *ctx, const struct flow *flow)
{
bool ret = false;
int i;
struct xbridge_addr *xbridge_addr = xbridge_addr_ref(ctx->xbridge->addr);
/* Verify if 'nw_dst' of ARP or 'ipv6_dst' of ICMPV6 is in the list. */
for (i = 0; xbridge_addr && i < xbridge_addr->n_addr; i++) {
struct in6_addr *ip_addr = &xbridge_addr->addr[i];
if (is_neighbor_reply_matched(flow, ip_addr)) {
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
/* Found a match. */
ret = true;
break;
}
}
xbridge_addr_unref(xbridge_addr);
/* If not found in bridge's IPs, search in its ports. */
if (!ret) {
struct in6_addr *ip_addr, *mask;
struct xport *port;
int error, n_in6;
HMAP_FOR_EACH (port, ofp_node, &ctx->xbridge->xports) {
error = netdev_get_addr_list(port->netdev, &ip_addr,
&mask, &n_in6);
if (!error) {
ret = is_neighbor_reply_matched(flow, ip_addr);
free(ip_addr);
free(mask);
if (ret) {
/* Found a match. */
break;
}
}
}
}
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
return ret;
}
static bool
ofproto-dpif-xlate: Terminate native tunnels only on ports with IP addresses. Commit dc0bd12f5b04 removed restriction that tunnel endpoint must be a bridge port. So, currently OVS has to check if the native tunnel needs to be terminated regardless of the output port. Unfortunately, there is a side effect: tnl_port_map_lookup() always adds at least 'dl_dst' match to the megaflow that ends up in the corresponding datapath flow. And since tunneling works on L3 level and not restricted by any particular bridge, this extra match criteria is added to every datapath flow on every bridge even if that bridge cannot be part of a tunnel processing. For example, if OVS has at least one tunnel configured and we're adding a completely separate bridge with 2 ports and simple rules to forward packets between two ports, there still will be a match on a destination mac address: 1. <create a tunnel configuration in OVS> 2. ovs-vsctl add-br br-non-tunnel -- set bridge datapath_type=netdev 3. ovs-vsctl add-port br-non-tunnel port0 -- add-port br-non-tunnel port1 4. ovs-ofctl del-flows br-non-tunnel 5. ovs-ofctl add-flow br-non-tunnel in_port=port0,actions=port1 6. ovs-ofctl add-flow br-non-tunnel in_port=port1,actions=port0 # ovs-appctl ofproto/trace br-non-tunnel in_port=port0 Flow: in_port=1,vlan_tci=0x0000, dl_src=00:00:00:00:00:00,dl_dst=00:00:00:00:00:00,dl_type=0x0000 bridge("br-non-tunnel") ----------------------- 0. in_port=1, priority 32768 output:2 Final flow: unchanged Megaflow: recirc_id=0,eth,in_port=1,dl_dst=00:00:00:00:00:00,dl_type=0x0000 Datapath actions: 5 ^^^^^^^^^^^^^^^^^^^^^^^^ This increases the number of upcalls and installed datapath flows, since separate flow needs to be installed per destination MAC, reducing the switching performance. This also blocks datapath performance optimizations that are based on the datapath flow simplicity. In general, in order to be a tunnel endpoint, port has to have an IP address. 
Hence native tunnel termination should be attempted only for such ports. This allows to avoid extra matches in most cases. Fixes: dc0bd12f5b04 ("userspace: Enable non-bridge port as tunnel endpoint.") Reported-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-October/388904.html Signed-off-by: Ilya Maximets <i.maximets@ovn.org> Acked-by: Mike Pattrick <mkp@redhat.com>
2021-11-01 21:14:38 +01:00
xport_has_ip(const struct xport *xport)
{
struct in6_addr *ip_addr, *mask;
int n_in6 = 0;
if (netdev_get_addr_list(xport->netdev, &ip_addr, &mask, &n_in6)) {
n_in6 = 0;
} else {
free(ip_addr);
free(mask);
}
return n_in6 ? true : false;
}
/* Returns true if 'flow' looks like an ARP or ICMPv6 neighbor reply that is
 * correctly addressed to this bridge; otherwise returns false. */
static bool
check_neighbor_reply(struct xlate_ctx *ctx, struct flow *flow)
{
    bool candidate = flow->dl_type == htons(ETH_TYPE_ARP)
                     || flow->nw_proto == IPPROTO_ICMPV6;

    return candidate && is_neighbor_reply_correct(ctx, flow);
}
ofproto-dpif-xlate: Terminate native tunnels only on ports with IP addresses. Commit dc0bd12f5b04 removed restriction that tunnel endpoint must be a bridge port. So, currently OVS has to check if the native tunnel needs to be terminated regardless of the output port. Unfortunately, there is a side effect: tnl_port_map_lookup() always adds at least 'dl_dst' match to the megaflow that ends up in the corresponding datapath flow. And since tunneling works on L3 level and not restricted by any particular bridge, this extra match criteria is added to every datapath flow on every bridge even if that bridge cannot be part of a tunnel processing. For example, if OVS has at least one tunnel configured and we're adding a completely separate bridge with 2 ports and simple rules to forward packets between two ports, there still will be a match on a destination mac address: 1. <create a tunnel configuration in OVS> 2. ovs-vsctl add-br br-non-tunnel -- set bridge datapath_type=netdev 3. ovs-vsctl add-port br-non-tunnel port0 -- add-port br-non-tunnel port1 4. ovs-ofctl del-flows br-non-tunnel 5. ovs-ofctl add-flow br-non-tunnel in_port=port0,actions=port1 6. ovs-ofctl add-flow br-non-tunnel in_port=port1,actions=port0 # ovs-appctl ofproto/trace br-non-tunnel in_port=port0 Flow: in_port=1,vlan_tci=0x0000, dl_src=00:00:00:00:00:00,dl_dst=00:00:00:00:00:00,dl_type=0x0000 bridge("br-non-tunnel") ----------------------- 0. in_port=1, priority 32768 output:2 Final flow: unchanged Megaflow: recirc_id=0,eth,in_port=1,dl_dst=00:00:00:00:00:00,dl_type=0x0000 Datapath actions: 5 ^^^^^^^^^^^^^^^^^^^^^^^^ This increases the number of upcalls and installed datapath flows, since separate flow needs to be installed per destination MAC, reducing the switching performance. This also blocks datapath performance optimizations that are based on the datapath flow simplicity. In general, in order to be a tunnel endpoint, port has to have an IP address. 
Hence native tunnel termination should be attempted only for such ports. This allows to avoid extra matches in most cases. Fixes: dc0bd12f5b04 ("userspace: Enable non-bridge port as tunnel endpoint.") Reported-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-October/388904.html Signed-off-by: Ilya Maximets <i.maximets@ovn.org> Acked-by: Mike Pattrick <mkp@redhat.com>
2021-11-01 21:14:38 +01:00
static bool
terminate_native_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
struct flow *flow, struct flow_wildcards *wc,
odp_port_t *tnl_port)
{
*tnl_port = ODPP_NONE;
/* XXX: Write better Filter for tunnel port. We can use in_port
ofproto-dpif-xlate: Terminate native tunnels only on ports with IP addresses. Commit dc0bd12f5b04 removed restriction that tunnel endpoint must be a bridge port. So, currently OVS has to check if the native tunnel needs to be terminated regardless of the output port. Unfortunately, there is a side effect: tnl_port_map_lookup() always adds at least 'dl_dst' match to the megaflow that ends up in the corresponding datapath flow. And since tunneling works on L3 level and not restricted by any particular bridge, this extra match criteria is added to every datapath flow on every bridge even if that bridge cannot be part of a tunnel processing. For example, if OVS has at least one tunnel configured and we're adding a completely separate bridge with 2 ports and simple rules to forward packets between two ports, there still will be a match on a destination mac address: 1. <create a tunnel configuration in OVS> 2. ovs-vsctl add-br br-non-tunnel -- set bridge datapath_type=netdev 3. ovs-vsctl add-port br-non-tunnel port0 -- add-port br-non-tunnel port1 4. ovs-ofctl del-flows br-non-tunnel 5. ovs-ofctl add-flow br-non-tunnel in_port=port0,actions=port1 6. ovs-ofctl add-flow br-non-tunnel in_port=port1,actions=port0 # ovs-appctl ofproto/trace br-non-tunnel in_port=port0 Flow: in_port=1,vlan_tci=0x0000, dl_src=00:00:00:00:00:00,dl_dst=00:00:00:00:00:00,dl_type=0x0000 bridge("br-non-tunnel") ----------------------- 0. in_port=1, priority 32768 output:2 Final flow: unchanged Megaflow: recirc_id=0,eth,in_port=1,dl_dst=00:00:00:00:00:00,dl_type=0x0000 Datapath actions: 5 ^^^^^^^^^^^^^^^^^^^^^^^^ This increases the number of upcalls and installed datapath flows, since separate flow needs to be installed per destination MAC, reducing the switching performance. This also blocks datapath performance optimizations that are based on the datapath flow simplicity. In general, in order to be a tunnel endpoint, port has to have an IP address. 
Hence native tunnel termination should be attempted only for such ports. This allows to avoid extra matches in most cases. Fixes: dc0bd12f5b04 ("userspace: Enable non-bridge port as tunnel endpoint.") Reported-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-October/388904.html Signed-off-by: Ilya Maximets <i.maximets@ovn.org> Acked-by: Mike Pattrick <mkp@redhat.com>
2021-11-01 21:14:38 +01:00
* in tunnel-port flow to avoid these checks completely.
*
* Port without an IP address cannot be a tunnel termination point.
* Not performing a lookup in this case to avoid unwildcarding extra
* flow fields (dl_dst). */
if (ovs_native_tunneling_is_on(ctx->xbridge->ofproto)
&& xport_has_ip(xport)) {
*tnl_port = tnl_port_map_lookup(flow, wc);
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
/* If no tunnel port was found and it's about an ARP or ICMPv6 packet,
* do tunnel neighbor snooping. */
if (*tnl_port == ODPP_NONE &&
(check_neighbor_reply(ctx, flow) || is_garp(flow, wc))) {
tnl_neigh_snoop(flow, wc, ctx->xbridge->name,
ctx->xin->allow_side_effects);
} else if (*tnl_port != ODPP_NONE &&
ctx->xin->allow_side_effects &&
dl_type_is_ip_any(flow->dl_type)) {
struct eth_addr mac = flow->dl_src;
struct in6_addr s_ip6;
if (flow->dl_type == htons(ETH_TYPE_IP)) {
in6_addr_set_mapped_ipv4(&s_ip6, flow->nw_src);
} else {
s_ip6 = flow->ipv6_src;
}
tnl_neigh_set(ctx->xbridge->name, &s_ip6, mac);
xlate: Move tnl_neigh_snoop() to terminate_native_tunnel() Currently OVS snoops any ARP or ND packets in any bridge and populates the tunnel neighbor cache with the retreived data. For instance, when an ARP reply originated by a tenant is received in an overlay bridge, the ARP packet is snooped and tunnel neighbor cache is filled with tenant address information. This is at best useless as tunnel endpoints can only reside on an underlay bridge. The real problem starts if different tenants on the overlay bridge have overlapping IP addresses such that they keep overwriting each other's pseudo tunnel neighbor entries. These frequent updates are treated as configuration changes and trigger revalidation each time, thus causing a lot of useless revalidation load on the system. To keep the ARP neighbor cache clean, this patch moves tunnel neighbor snooping from the generic function do_xlate_actions() to the specific funtion terminate_native_tunnel() in compose_output_action(). Thus, only ARP and Neighbor Advertisement packets addressing a local tunnel endpoint (on the LOCAL port of the underlay bridge) are snooped. In order to achieve this, IP addresses of the bridge ports are retrieved and then stored in xbridge by calling xlate_xbridge_set(). The destination address extracted from the ARP or Neighbor Advertisement packet is then matched against the known xbridge addresses in is_neighbor_reply_correct() to filter the snooped packets further. Signed-off-by: Zoltan Balogh <zoltan.balogh.eth@gmail.com> Co-authored-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-04-04 23:57:54 +02:00
}
}
return *tnl_port != ODPP_NONE;
}
static void
compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
const struct xlate_bond_recirc *xr, bool check_stp,
bool is_last_action, bool truncate)
{
const struct xport *xport = get_ofp_port(ctx->xbridge, ofp_port);
struct flow_wildcards *wc = ctx->wc;
struct flow *flow = &ctx->xin->flow;
struct flow_tnl *flow_tnl = NULL;
union flow_vlan_hdr flow_vlans[FLOW_MAX_VLAN_HEADERS];
uint8_t flow_nw_tos;
odp_port_t out_port, odp_port, odp_tnl_port;
bool is_native_tunnel = false;
uint8_t dscp;
struct eth_addr flow_dl_dst = flow->dl_dst;
struct eth_addr flow_dl_src = flow->dl_src;
ovs_be32 flow_packet_type = flow->packet_type;
ovs_be16 flow_dl_type = flow->dl_type;
/* If 'struct flow' gets additional metadata, we'll need to zero it out
* before traversing a patch port. */
BUILD_ASSERT_DECL(FLOW_WC_SEQ == 42);
if (!check_output_prerequisites(ctx, xport, flow, check_stp)) {
return;
}
if (flow->packet_type == htonl(PT_ETH)) {
/* Strip Ethernet header for legacy L3 port. */
if (xport->pt_mode == NETDEV_PT_LEGACY_L3) {
flow->packet_type = PACKET_TYPE_BE(OFPHTN_ETHERTYPE,
ntohs(flow->dl_type));
2022-04-04 00:26:17 +02:00
if (ctx->pending_encap) {
/* The Ethernet header was not actually added yet. */
ctx->pending_encap = false;
}
}
userspace: Switching of L3 packets in L2 pipeline Ports have a new layer3 attribute if they send/receive L3 packets. The packet_type included in structs dp_packet and flow is considered in ofproto-dpif. The classical L2 match fields (dl_src, dl_dst, dl_type, and vlan_tci, vlan_vid, vlan_pcp) now have Ethernet as pre-requisite. A dummy ethernet header is pushed to L3 packets received from L3 ports before the the pipeline processing starts. The ethernet header is popped before sending a packet to a L3 port. For datapath ports that can receive L2 or L3 packets, the packet_type becomes part of the flow key for datapath flows and is handled appropriately in dpif-netdev. In the 'else' branch in flow_put_on_pmd() function, the additional check flow_equal(&match.flow, &netdev_flow->flow) was removed, as a) the dpcls lookup is sufficient to uniquely identify a flow and b) it caused false negatives because the flow in netdev->flow may not properly masked. In dpif_netdev_flow_put() we now use the same method for constructing the netdev_flow_key as the one used when adding the flow to the dplcs to make sure these always match. The function netdev_flow_key_from_flow() used so far was not only inefficient but sometimes caused mismatches and subsequent flow update failures. The kernel datapath does not support the packet_type match field. Instead it encodes the packet type implictly by the presence or absence of the Ethernet attribute in the flow key and mask. This patch filters the PACKET_TYPE attribute out of netlink flow key and mask to be sent to the kernel datapath. Signed-off-by: Lorand Jakab <lojakab@cisco.com> Signed-off-by: Simon Horman <simon.horman@netronome.com> Signed-off-by: Jiri Benc <jbenc@redhat.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-06-02 16:16:17 +00:00
}
if (xport->peer) {
if (truncate) {
xlate_report_error(ctx, "Cannot truncate output to patch port");
}
patch_port_output(ctx, xport, xport->peer, is_last_action);
return;
}
memcpy(flow_vlans, flow->vlans, sizeof flow_vlans);
flow_nw_tos = flow->nw_tos;
if (count_skb_priorities(xport)) {
memset(&wc->masks.skb_priority, 0xff, sizeof wc->masks.skb_priority);
if (dscp_from_skb_priority(xport, flow->skb_priority, &dscp)) {
wc->masks.nw_tos |= IP_DSCP_MASK;
flow->nw_tos &= ~IP_DSCP_MASK;
flow->nw_tos |= dscp;
}
}
if (xport->is_tunnel) {
struct in6_addr dst;
/* Save tunnel metadata so that changes made due to
* the Logical (tunnel) Port are not visible for any further
* matches, while explicit set actions on tunnel metadata are.
*/
flow_tnl = xmemdup(&flow->tunnel, sizeof *flow_tnl);
odp_port = tnl_port_send(xport->ofport, flow, ctx->wc);
if (odp_port == ODPP_NONE) {
xlate_report(ctx, OFT_WARN, "Tunneling decided against output");
goto out; /* restore flow_nw_tos */
}
dst = flow_tnl_dst(&flow->tunnel);
if (ipv6_addr_equals(&dst, &ctx->orig_tunnel_ipv6_dst)) {
xlate_report(ctx, OFT_WARN, "Not tunneling to our own address");
goto out; /* restore flow_nw_tos */
}
if (ctx->xin->resubmit_stats) {
netdev_vport_inc_tx(xport->netdev, ctx->xin->resubmit_stats);
}
if (ctx->xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NETDEV);
entry->dev.tx = netdev_ref(xport->netdev);
}
out_port = odp_port;
if (ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) {
xlate_report(ctx, OFT_DETAIL, "output to native tunnel");
is_native_tunnel = true;
} else {
const char *tnl_type;
xlate_report(ctx, OFT_DETAIL, "output to kernel tunnel");
tnl_type = tnl_port_get_type(xport->ofport);
commit_odp_tunnel_action(flow, &ctx->base_flow,
ctx->odp_actions, tnl_type);
flow->tunnel = *flow_tnl; /* Restore tunnel metadata. */
}
} else {
odp_port = xport->odp_port;
out_port = odp_port;
}
if (out_port != ODPP_NONE) {
/* Commit accumulated flow updates before output. */
xlate_commit_actions(ctx);
userspace: Avoid dp_hash recirculation for balance-tcp bond mode. Problem: In OVS, flows with output over a bond interface of type “balance-tcp” gets translated by the ofproto layer into "HASH" and "RECIRC" datapath actions. After recirculation, the packet is forwarded to the bond member port based on 8-bits of the datapath hash value computed through dp_hash. This causes performance degradation in the following ways: 1. The recirculation of the packet implies another lookup of the packet’s flow key in the exact match cache (EMC) and potentially Megaflow classifier (DPCLS). This is the biggest cost factor. 2. The recirculated packets have a new “RSS” hash and compete with the original packets for the scarce number of EMC slots. This implies more EMC misses and potentially EMC thrashing causing costly DPCLS lookups. 3. The 256 extra megaflow entries per bond for dp_hash bond selection put additional load on the revalidation threads. Owing to this performance degradation, deployments stick to “balance-slb” bond mode even though it does not do active-active load balancing for VXLAN- and GRE-tunnelled traffic because all tunnel packet have the same source MAC address. Proposed optimization: This proposal introduces a new load-balancing output action instead of recirculation. Maintain one table per-bond (could just be an array of uint16's) and program it the same way internal flows are created today for each possible hash value (256 entries) from ofproto layer. Use this table to load-balance flows as part of output action processing. Currently xlate_normal() -> output_normal() -> bond_update_post_recirc_rules() -> bond_may_recirc() and compose_output_action__() generate 'dp_hash(hash_l4(0))' and 'recirc(<RecircID>)' actions. In this case the RecircID identifies the bond. For the recirculated packets the ofproto layer installs megaflow entries that match on RecircID and masked dp_hash and send them to the corresponding output port. 
Instead, we will now generate action as 'lb_output(<bond id>)' This combines hash computation (only if needed, else re-use RSS hash) and inline load-balancing over the bond. This action is used *only* for balance-tcp bonds in userspace datapath (the OVS kernel datapath remains unchanged). Example: Current scheme: With 8 UDP flows (with random UDP src port): flow-dump from pmd on cpu core: 2 recirc_id(0),in_port(7),<...> actions:hash(hash_l4(0)),recirc(0x1) recirc_id(0x1),dp_hash(0xf8e02b7e/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb236c260/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x7d89eb18/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0xa78d75df/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb58d846f/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0x24534406/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x3cf32550/0xff),<...> actions:1 New scheme: We can do with a single flow entry (for any number of new flows): in_port(7),<...> actions:lb_output(1) A new CLI has been added to dump datapath bond cache as given below. # ovs-appctl dpif-netdev/bond-show [dp] Bond cache: bond-id 1 : bucket 0 - slave 2 bucket 1 - slave 1 bucket 2 - slave 2 bucket 3 - slave 1 Co-authored-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Vishal Deep Ajmera <vishal.deep.ajmera@ericsson.com> Tested-by: Matteo Croce <mcroce@redhat.com> Tested-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-05-22 10:50:05 +02:00
if (xr && bond_use_lb_output_action(xport->xbundle->bond)) {
/*
* If bond mode is balance-tcp and optimize balance tcp is enabled
* then use the hash directly for member selection and avoid
userspace: Avoid dp_hash recirculation for balance-tcp bond mode. Problem: In OVS, flows with output over a bond interface of type “balance-tcp” gets translated by the ofproto layer into "HASH" and "RECIRC" datapath actions. After recirculation, the packet is forwarded to the bond member port based on 8-bits of the datapath hash value computed through dp_hash. This causes performance degradation in the following ways: 1. The recirculation of the packet implies another lookup of the packet’s flow key in the exact match cache (EMC) and potentially Megaflow classifier (DPCLS). This is the biggest cost factor. 2. The recirculated packets have a new “RSS” hash and compete with the original packets for the scarce number of EMC slots. This implies more EMC misses and potentially EMC thrashing causing costly DPCLS lookups. 3. The 256 extra megaflow entries per bond for dp_hash bond selection put additional load on the revalidation threads. Owing to this performance degradation, deployments stick to “balance-slb” bond mode even though it does not do active-active load balancing for VXLAN- and GRE-tunnelled traffic because all tunnel packet have the same source MAC address. Proposed optimization: This proposal introduces a new load-balancing output action instead of recirculation. Maintain one table per-bond (could just be an array of uint16's) and program it the same way internal flows are created today for each possible hash value (256 entries) from ofproto layer. Use this table to load-balance flows as part of output action processing. Currently xlate_normal() -> output_normal() -> bond_update_post_recirc_rules() -> bond_may_recirc() and compose_output_action__() generate 'dp_hash(hash_l4(0))' and 'recirc(<RecircID>)' actions. In this case the RecircID identifies the bond. For the recirculated packets the ofproto layer installs megaflow entries that match on RecircID and masked dp_hash and send them to the corresponding output port. 
Instead, we will now generate action as 'lb_output(<bond id>)' This combines hash computation (only if needed, else re-use RSS hash) and inline load-balancing over the bond. This action is used *only* for balance-tcp bonds in userspace datapath (the OVS kernel datapath remains unchanged). Example: Current scheme: With 8 UDP flows (with random UDP src port): flow-dump from pmd on cpu core: 2 recirc_id(0),in_port(7),<...> actions:hash(hash_l4(0)),recirc(0x1) recirc_id(0x1),dp_hash(0xf8e02b7e/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb236c260/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x7d89eb18/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0xa78d75df/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb58d846f/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0x24534406/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x3cf32550/0xff),<...> actions:1 New scheme: We can do with a single flow entry (for any number of new flows): in_port(7),<...> actions:lb_output(1) A new CLI has been added to dump datapath bond cache as given below. # ovs-appctl dpif-netdev/bond-show [dp] Bond cache: bond-id 1 : bucket 0 - slave 2 bucket 1 - slave 1 bucket 2 - slave 2 bucket 3 - slave 1 Co-authored-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Vishal Deep Ajmera <vishal.deep.ajmera@ericsson.com> Tested-by: Matteo Croce <mcroce@redhat.com> Tested-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2020-05-22 10:50:05 +02:00
* recirculation.
*
* Currently support for netdev datapath only.
*/
nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_LB_OUTPUT,
xr->recirc_id);
} else if (xr) {
/* Recirculate the packet. */
struct ovs_action_hash *act_hash;
/* Hash action. */
enum ovs_hash_alg hash_alg = xr->hash_alg;
if (hash_alg > ctx->xbridge->support.max_hash_alg) {
/* Algorithm supported by all datapaths. */
hash_alg = OVS_HASH_ALG_L4;
}
act_hash = nl_msg_put_unspec_uninit(ctx->odp_actions,
OVS_ACTION_ATTR_HASH,
sizeof *act_hash);
act_hash->hash_alg = hash_alg;
act_hash->hash_basis = xr->hash_basis;
/* Recirc action. */
nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC,
xr->recirc_id);
} else if (is_native_tunnel) {
/* Output to native tunnel port. */
native_tunnel_output(ctx, xport, flow, odp_port, truncate,
is_last_action);
ovs_assert(flow_tnl);
flow->tunnel = *flow_tnl; /* Restore tunnel metadata. */
ofproto-dpif-xlate: Terminate native tunnels only on ports with IP addresses. Commit dc0bd12f5b04 removed restriction that tunnel endpoint must be a bridge port. So, currently OVS has to check if the native tunnel needs to be terminated regardless of the output port. Unfortunately, there is a side effect: tnl_port_map_lookup() always adds at least 'dl_dst' match to the megaflow that ends up in the corresponding datapath flow. And since tunneling works on L3 level and not restricted by any particular bridge, this extra match criteria is added to every datapath flow on every bridge even if that bridge cannot be part of a tunnel processing. For example, if OVS has at least one tunnel configured and we're adding a completely separate bridge with 2 ports and simple rules to forward packets between two ports, there still will be a match on a destination mac address: 1. <create a tunnel configuration in OVS> 2. ovs-vsctl add-br br-non-tunnel -- set bridge datapath_type=netdev 3. ovs-vsctl add-port br-non-tunnel port0 -- add-port br-non-tunnel port1 4. ovs-ofctl del-flows br-non-tunnel 5. ovs-ofctl add-flow br-non-tunnel in_port=port0,actions=port1 6. ovs-ofctl add-flow br-non-tunnel in_port=port1,actions=port0 # ovs-appctl ofproto/trace br-non-tunnel in_port=port0 Flow: in_port=1,vlan_tci=0x0000, dl_src=00:00:00:00:00:00,dl_dst=00:00:00:00:00:00,dl_type=0x0000 bridge("br-non-tunnel") ----------------------- 0. in_port=1, priority 32768 output:2 Final flow: unchanged Megaflow: recirc_id=0,eth,in_port=1,dl_dst=00:00:00:00:00:00,dl_type=0x0000 Datapath actions: 5 ^^^^^^^^^^^^^^^^^^^^^^^^ This increases the number of upcalls and installed datapath flows, since separate flow needs to be installed per destination MAC, reducing the switching performance. This also blocks datapath performance optimizations that are based on the datapath flow simplicity. In general, in order to be a tunnel endpoint, port has to have an IP address. 
Hence native tunnel termination should be attempted only for such ports. This allows to avoid extra matches in most cases. Fixes: dc0bd12f5b04 ("userspace: Enable non-bridge port as tunnel endpoint.") Reported-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-October/388904.html Signed-off-by: Ilya Maximets <i.maximets@ovn.org> Acked-by: Mike Pattrick <mkp@redhat.com>
2021-11-01 21:14:38 +01:00
} else if (terminate_native_tunnel(ctx, xport, flow, wc,
&odp_tnl_port)) {
/* Intercept packet to be received on native tunnel port. */
nl_msg_put_odp_port(ctx->odp_actions, OVS_ACTION_ATTR_TUNNEL_POP,
odp_tnl_port);
} else {
/* Tunnel push-pop action is not compatible with
* IPFIX action. */
compose_ipfix_action(ctx, out_port);
/* Handle truncation of the mirrored packet. */
if (ctx->mirror_snaplen > 0 &&
ctx->mirror_snaplen < UINT16_MAX) {
struct ovs_action_trunc *trunc;
trunc = nl_msg_put_unspec_uninit(ctx->odp_actions,
OVS_ACTION_ATTR_TRUNC,
sizeof *trunc);
trunc->max_len = ctx->mirror_snaplen;
if (!ctx->xbridge->support.trunc) {
ctx->xout->slow |= SLOW_ACTION;
}
}
nl_msg_put_odp_port(ctx->odp_actions,
OVS_ACTION_ATTR_OUTPUT,
out_port);
}
ctx->sflow_odp_port = odp_port;
ctx->sflow_n_outputs++;
ctx->nf_output_iface = ofp_port;
}
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
if (mbridge_has_mirrors(ctx->xbridge->mbridge) && xport->xbundle) {
mirror_packet(ctx, xport->xbundle,
xbundle_mirror_dst(xport->xbundle->xbridge,
xport->xbundle));
}
out:
/* Restore flow */
memcpy(flow->vlans, flow_vlans, sizeof flow->vlans);
flow->nw_tos = flow_nw_tos;
flow->dl_dst = flow_dl_dst;
flow->dl_src = flow_dl_src;
flow->packet_type = flow_packet_type;
flow->dl_type = flow_dl_type;
free(flow_tnl);
}
/* Composes datapath actions to output to OpenFlow port 'ofp_port'.
 *
 * Thin convenience wrapper around compose_output_action__() that always
 * requests the STP/port-state check (fourth argument 'true'). */
static void
compose_output_action(struct xlate_ctx *ctx, ofp_port_t ofp_port,
                      const struct xlate_bond_recirc *xr,
                      bool is_last_action, bool truncate)
{
    compose_output_action__(ctx, ofp_port, xr, true, is_last_action,
                            truncate);
}
/* Translates the actions of 'rule' recursively within 'ctx', via
 * 'actions_xlator'.
 *
 * Saves and restores the translation context's current rule, its flow
 * cookie, and (when 'deepens' is true) its recursion depth, so that on
 * return 'ctx' reflects the caller's rule again.  Credits 'rule' with the
 * resubmit statistics, if any, before translating.
 *
 * 'is_last_action' is passed through to the actions translator so it can
 * tell whether anything else will execute after these actions. */
static void
xlate_recursively(struct xlate_ctx *ctx, struct rule_dpif *rule,
                  bool deepens, bool is_last_action,
                  xlate_actions_handler *actions_xlator)
{
    struct rule_dpif *old_rule = ctx->rule;
    ovs_be64 old_cookie = ctx->rule_cookie;
    const struct rule_actions *actions;

    if (ctx->xin->resubmit_stats) {
        rule_dpif_credit_stats(rule, ctx->xin->resubmit_stats, false);
    }

    ctx->resubmits++;

    ctx->depth += deepens;
    ctx->rule = rule;
    ctx->rule_cookie = rule->up.flow_cookie;
    actions = rule_get_actions(&rule->up);
    actions_xlator(actions->ofpacts, actions->ofpacts_len, ctx,
                   is_last_action, false);
    ctx->rule_cookie = old_cookie;
    ctx->rule = old_rule;
    ctx->depth -= deepens;
}
/* Checks the per-translation resource limits that bound recursive
 * processing: recursion depth, resubmit count, generated datapath action
 * size, and expression stack size.
 *
 * Returns true if translation may continue.  Otherwise reports the
 * problem and returns false, recording an error in ctx->error for every
 * case except oversized datapath actions, where translation is merely
 * terminated (ctx->exit) so the flow can be slow-pathed. */
static bool
xlate_resubmit_resource_check(struct xlate_ctx *ctx)
{
    if (ctx->depth >= MAX_DEPTH) {
        xlate_report_error(ctx, "over max translation depth %d", MAX_DEPTH);
        ctx->error = XLATE_RECURSION_TOO_DEEP;
        return false;
    }

    if (ctx->resubmits >= MAX_RESUBMITS) {
        xlate_report_error(ctx, "over %d resubmit actions", MAX_RESUBMITS);
        ctx->error = XLATE_TOO_MANY_RESUBMITS;
        return false;
    }

    if (ctx->odp_actions->size > UINT16_MAX) {
        xlate_report_error(ctx, "resubmits yielded over 64 kB of actions");
        /* NOT an error, as we'll be slow-pathing the flow in this case? */
        ctx->exit = true; /* XXX: translation still terminated! */
        return false;
    }

    if (ctx->stack.size >= 65536) {
        xlate_report_error(ctx, "resubmits yielded over 64 kB of stack");
        ctx->error = XLATE_STACK_TOO_DEEP;
        return false;
    }

    return true;
}
/* Exchanges the packet 5-tuple fields of 'flow' (protocol, L3 addresses,
 * L4 ports) with the corresponding conntrack original-direction fields
 * (ct_nw_proto, ct_nw_src/dst or ct_ipv6_src/dst, ct_tp_src/dst).
 *
 * 'ipv4' selects whether the IPv4 or the IPv6 address fields are swapped.
 * Applying this function twice restores 'flow' to its original state. */
static void
tuple_swap_flow(struct flow *flow, bool ipv4)
{
    uint8_t proto_tmp = flow->nw_proto;
    flow->nw_proto = flow->ct_nw_proto;
    flow->ct_nw_proto = proto_tmp;

    ovs_be16 port_tmp = flow->tp_src;
    flow->tp_src = flow->ct_tp_src;
    flow->ct_tp_src = port_tmp;

    port_tmp = flow->tp_dst;
    flow->tp_dst = flow->ct_tp_dst;
    flow->ct_tp_dst = port_tmp;

    if (ipv4) {
        ovs_be32 addr_tmp = flow->nw_src;
        flow->nw_src = flow->ct_nw_src;
        flow->ct_nw_src = addr_tmp;

        addr_tmp = flow->nw_dst;
        flow->nw_dst = flow->ct_nw_dst;
        flow->ct_nw_dst = addr_tmp;
    } else {
        struct in6_addr addr6_tmp = flow->ipv6_src;
        flow->ipv6_src = flow->ct_ipv6_src;
        flow->ct_ipv6_src = addr6_tmp;

        addr6_tmp = flow->ipv6_dst;
        flow->ipv6_dst = flow->ct_ipv6_dst;
        flow->ct_ipv6_dst = addr6_tmp;
    }
}
/* Swaps the 5-tuple with the conntrack original-direction tuple in both
 * 'flow' and the corresponding wildcard masks in 'wc', choosing the IPv4
 * or IPv6 address fields based on the flow's Ethertype. */
static void
tuple_swap(struct flow *flow, struct flow_wildcards *wc)
{
    bool is_ipv4 = flow->dl_type == htons(ETH_TYPE_IP);

    tuple_swap_flow(flow, is_ipv4);
    tuple_swap_flow(&wc->masks, is_ipv4);
}
/* Performs a flow table lookup in table 'table_id' for the current flow
 * and, if a rule matches, recursively translates its actions via 'xlator'.
 *
 * 'in_port' is the OpenFlow input port used for the lookup.
 * 'may_packet_in' and 'honor_table_miss' control table-miss handling in
 * the lookup.  If 'with_ct_orig' is true, the packet's 5-tuple is swapped
 * with the conntrack original-direction tuple for the duration of the
 * lookup (resubmit(,table,ct) semantics); this requires a tracked IP flow.
 *
 * Resource limits (depth, resubmits, action/stack size) are checked first;
 * when exceeded, translation is aborted without a lookup.  If an MPLS
 * transition requires recirculation, a freeze is triggered instead. */
static void
xlate_table_action(struct xlate_ctx *ctx, ofp_port_t in_port, uint8_t table_id,
                   bool may_packet_in, bool honor_table_miss,
                   bool with_ct_orig, bool is_last_action,
                   xlate_actions_handler *xlator)
{
    /* Check if we need to recirculate before matching in a table. */
    if (ctx->was_mpls) {
        ctx_trigger_freeze(ctx);
        return;
    }
    if (xlate_resubmit_resource_check(ctx)) {
        uint8_t old_table_id = ctx->table_id;
        struct rule_dpif *rule;

        ctx->table_id = table_id;

        /* Swap packet fields with CT 5-tuple if requested. */
        if (with_ct_orig) {
            /* Do not swap if there is no CT tuple, or if key is not IP. */
            if (ctx->xin->flow.ct_nw_proto == 0 ||
                !is_ip_any(&ctx->xin->flow)) {
                xlate_report_error(ctx,
                    "resubmit(ct) with non-tracked or non-IP packet!");
                ctx->table_id = old_table_id;
                return;
            }
            tuple_swap(&ctx->xin->flow, ctx->wc);
        }
        rule = rule_dpif_lookup_from_table(ctx->xbridge->ofproto,
                                           ctx->xin->tables_version,
                                           &ctx->xin->flow, ctx->wc,
                                           ctx->xin->resubmit_stats,
                                           &ctx->table_id, in_port,
                                           may_packet_in, honor_table_miss,
                                           ctx->xin->xcache);
        /* Swap back. */
        if (with_ct_orig) {
            tuple_swap(&ctx->xin->flow, ctx->wc);
        }

        if (rule) {
            /* Fill in the cache entry here instead of xlate_recursively
             * to make the reference counting more explicit.  We take a
             * reference in the lookups above if we are going to cache the
             * rule. */
            if (ctx->xin->xcache) {
                struct xc_entry *entry;

                entry = xlate_cache_add_entry(ctx->xin->xcache, XC_RULE);
                entry->rule = rule;
                ofproto_rule_ref(&rule->up);
            }

            struct ovs_list *old_trace = ctx->xin->trace;
            xlate_report_table(ctx, rule, table_id);
            xlate_recursively(ctx, rule, table_id <= old_table_id,
                              is_last_action, xlator);
            ctx->xin->trace = old_trace;
        }

        ctx->table_id = old_table_id;
        return;
    }
}
/* Consumes the group reference, which is only taken if xcache exists. */
static void
xlate_group_stats(struct xlate_ctx *ctx, struct group_dpif *group,
struct ofputil_bucket *bucket)
{
if (ctx->xin->resubmit_stats) {
group_dpif_credit_stats(group, bucket, ctx->xin->resubmit_stats);
}
if (ctx->xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_GROUP);
entry->group.group = group;
entry->group.bucket = bucket;
}
}
/* Translates the actions in one group 'bucket', then restores the translation
 * state so that each bucket effectively acts on its own clone of the packet. */
static void
xlate_group_bucket(struct xlate_ctx *ctx, struct ofputil_bucket *bucket,
                   bool is_last_action)
{
    struct ovs_list *saved_trace = ctx->xin->trace;
    if (OVS_UNLIKELY(ctx->xin->trace)) {
        char *title = xasprintf("bucket %"PRIu32, bucket->bucket_id);
        ctx->xin->trace = &oftrace_report(ctx->xin->trace, OFT_BUCKET,
                                          title)->subs;
        free(title);
    }

    uint64_t actions_stub[1024 / 8];
    struct ofpbuf action_list = OFPBUF_STUB_INITIALIZER(actions_stub);
    struct ofpbuf action_set = ofpbuf_const_initializer(bucket->ofpacts,
                                                        bucket->ofpacts_len);
    struct flow saved_flow = ctx->xin->flow;
    bool saved_was_mpls = ctx->was_mpls;

    /* A bucket is an action set; convert it into an action list and
     * translate it one nesting level deeper. */
    ofpacts_execute_action_set(&action_list, &action_set);
    ctx->depth++;
    do_xlate_actions(action_list.data, action_list.size, ctx, is_last_action,
                     true);
    ctx->depth--;
    ofpbuf_uninit(&action_list);

    /* Check if need to freeze. */
    if (ctx->freezing) {
        finish_freezing(ctx);
    }

    /* Roll back flow to previous state.
     * This is equivalent to cloning the packet for each bucket.
     *
     * As a side effect any subsequently applied actions will
     * also effectively be applied to a clone of the packet taken
     * just before applying the all or indirect group.
     *
     * Note that group buckets are action sets, hence they cannot modify the
     * main action set.  Also any stack actions are ignored when executing an
     * action set, so group buckets cannot directly change the stack either.
     * However, we do allow resubmit actions in group buckets, which could
     * recursively execute actions that do modify the action set or change the
     * stack.  The controller must be careful about what it does to the
     * action_set and stack in the tables resubmitted to from group buckets. */
    ctx->xin->flow = saved_flow;

    /* The group bucket popping MPLS should have no effect after bucket
     * execution. */
    ctx->was_mpls = saved_was_mpls;

    /* The fact that the group bucket exits (for any reason) does not mean that
     * the translation after the group action should exit.  Specifically, if
     * the group bucket freezes translation, the actions after the group action
     * must continue processing with the original, not the frozen packet! */
    ctx->exit = false;

    /* A translation error in one bucket must not leak into other buckets or
     * into subsequent actions, just as if each bucket had its own packet
     * clone.  There is no need to save and restore the previous error: the
     * group action itself only executes when no error was pending.
     *
     * Errors that enforce system limits (XLATE_RECURSION_TOO_DEEP,
     * XLATE_TOO_MANY_RESUBMITS, XLATE_STACK_TOO_DEEP) protect translation
     * from running too long or using too much space, so those are
     * deliberately left in place. */
    if (ctx->error == XLATE_TOO_MANY_MPLS_LABELS ||
        ctx->error == XLATE_UNSUPPORTED_PACKET_TYPE) {
        /* Reset the error and continue processing other buckets. */
        ctx->error = XLATE_OK;
    }

    ctx->xin->trace = saved_trace;
}
/* Fast-failover group: selects the first live bucket, or NULL if none. */
static struct ofputil_bucket *
pick_ff_group(struct xlate_ctx *ctx, struct group_dpif *group)
{
    struct ofputil_bucket *live_bucket = group_first_live_bucket(ctx, group, 0);
    return live_bucket;
}
/* Select group with the default selection method: hashes the flow with a
 * symmetric L4 hash and picks the best live bucket for that hash value. */
static struct ofputil_bucket *
pick_default_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
{
    /* Un-wildcard the fields that feed the hash so the resulting datapath
     * flow only matches packets that would select the same bucket. */
    flow_mask_hash_fields(&ctx->xin->flow, ctx->wc,
                          NX_HASH_FIELDS_SYMMETRIC_L4);

    uint32_t basis = flow_hash_symmetric_l4(&ctx->xin->flow, 0);
    return group_best_live_bucket(ctx, group, basis);
}
static struct ofputil_bucket *
pick_hash_fields_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
{
ofproto-dpif: Unhide structure contents. Until now, ofproto-dpif.c has hidden the definitions of several structures, such as struct ofproto_dpif and struct rule_dpif. This kind of information hiding is often beneficial, because it forces code outside the file with the definition to use the documented interfaces. However, in this case it was starting to burden ofproto-dpif with an increasing number of trivial helpers that were not improving or maintaining a useful abstraction and that were making code harder to maintain and read. Information hiding also made it hard to move blocks of code outside ofproto-dpif.c itself, since any code moved out often needed new helpers if it used anything that wasn't previously exposed. In the present instance, upcoming patches will move code for tracing outside ofproto-dpif, and this would require adding several helpers that would just obscure the function of the code otherwise needlessly. In balance, it seems that there is more harm than good in the information hiding here, so this commit moves the definitions of several structures from ofproto-dpif.c into ofproto-dpif.h. It also removes all of the trivial helpers that had accumulated, instead changing their users to directly access the members that they needed. It also reorganizes ofproto-dpif.h, grouping structure definitions and function prototypes in a sensible way. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Lance Richardson <lrichard@redhat.com> Acked-by: Justin Pettit <jpettit@ovn.org>
2016-12-06 14:08:42 -08:00
const struct field_array *fields = &group->up.props.fields;
const uint8_t *mask_values = fields->values;
uint32_t basis = hash_uint64(group->up.props.selection_method_param);
ofproto-dpif: Unhide structure contents. Until now, ofproto-dpif.c has hidden the definitions of several structures, such as struct ofproto_dpif and struct rule_dpif. This kind of information hiding is often beneficial, because it forces code outside the file with the definition to use the documented interfaces. However, in this case it was starting to burden ofproto-dpif with an increasing number of trivial helpers that were not improving or maintaining a useful abstraction and that were making code harder to maintain and read. Information hiding also made it hard to move blocks of code outside ofproto-dpif.c itself, since any code moved out often needed new helpers if it used anything that wasn't previously exposed. In the present instance, upcoming patches will move code for tracing outside ofproto-dpif, and this would require adding several helpers that would just obscure the function of the code otherwise needlessly. In balance, it seems that there is more harm than good in the information hiding here, so this commit moves the definitions of several structures from ofproto-dpif.c into ofproto-dpif.h. It also removes all of the trivial helpers that had accumulated, instead changing their users to directly access the members that they needed. It also reorganizes ofproto-dpif.h, grouping structure definitions and function prototypes in a sensible way. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Lance Richardson <lrichard@redhat.com> Acked-by: Justin Pettit <jpettit@ovn.org>
2016-12-06 14:08:42 -08:00
size_t i;
BITMAP_FOR_EACH_1 (i, MFF_N_IDS, fields->used.bm) {
const struct mf_field *mf = mf_from_id(i);
/* Skip fields for which prerequisites are not met. */
if (!mf_are_prereqs_ok(mf, &ctx->xin->flow, ctx->wc)) {
/* Skip the mask bytes for this field. */
mask_values += mf->n_bytes;
continue;
}
union mf_value value;
union mf_value mask;
mf_get_value(mf, &ctx->xin->flow, &value);
/* Mask the value. */
for (int j = 0; j < mf->n_bytes; j++) {
mask.b[j] = *mask_values++;
value.b[j] &= mask.b[j];
}
basis = hash_bytes(&value, mf->n_bytes, basis);
/* For tunnels, hash in whether the field is present. */
if (mf_is_tun_metadata(mf)) {
basis = hash_boolean(mf_is_set(mf, &ctx->xin->flow), basis);
}
mf_mask_field_masked(mf, &mask, ctx->wc);
}
return group_best_live_bucket(ctx, group, basis);
}
static struct ofputil_bucket *
pick_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
{
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
uint32_t dp_hash = ctx->xin->flow.dp_hash;
/* dp_hash value 0 is special since it means that the dp_hash has not been
* computed, as all computed dp_hash values are non-zero. Therefore
* compare to zero can be used to decide if the dp_hash value is valid
* without masking the dp_hash field. */
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
if (!dp_hash) {
enum ovs_hash_alg hash_alg = group->hash_alg;
if (hash_alg > ctx->xbridge->support.max_hash_alg) {
/* Algorithm supported by all datapaths. */
hash_alg = OVS_HASH_ALG_L4;
}
ctx_trigger_recirculate_with_hash(ctx, hash_alg, group->hash_basis);
return NULL;
} else {
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
uint32_t hash_mask = group->hash_mask;
ctx->wc->masks.dp_hash |= hash_mask;
/* Starting from the original masked dp_hash value iterate over the
* hash mapping table to find the first live bucket. As the buckets
* are quasi-randomly spread over the hash values, this maintains
* a distribution according to bucket weights even when some buckets
* are non-live. */
for (int i = 0; i <= hash_mask; i++) {
struct ofputil_bucket *b =
group->hash_map[(dp_hash + i) & hash_mask];
if (bucket_is_alive(ctx, group, b, 0)) {
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
return b;
}
}
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
return NULL;
}
}
static struct ofputil_bucket *
pick_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
{
/* Select groups may access flow keys beyond L2 in order to
* select a bucket. Recirculate as appropriate to make this possible.
*/
if (ctx->was_mpls) {
ctx_trigger_freeze(ctx);
return NULL;
}
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
switch (group->selection_method) {
case SEL_METHOD_DEFAULT:
return pick_default_select_group(ctx, group);
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
break;
case SEL_METHOD_HASH:
return pick_hash_fields_select_group(ctx, group);
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
break;
case SEL_METHOD_DP_HASH:
return pick_dp_hash_select_group(ctx, group);
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
break;
default:
/* Parsing of groups ensures this never happens */
OVS_NOT_REACHED();
}
ofproto-dpif: Improve dp_hash selection method for select groups The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. 
Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-05-24 17:28:00 +02:00
return NULL;
}
static void
xlate_group_action__(struct xlate_ctx *ctx, struct group_dpif *group,
                     bool is_last_action)
{
    if (group->up.type == OFPGT11_ALL || group->up.type == OFPGT11_INDIRECT) {
        /* Execute every bucket in order.  Only the final bucket may inherit
         * the 'is_last_action' property of the group action itself. */
        struct ovs_list *final_node = group->up.buckets.prev;
        struct ofputil_bucket *b;

        LIST_FOR_EACH (b, list_node, &group->up.buckets) {
            bool last = is_last_action && &b->list_node == final_node;
            xlate_group_bucket(ctx, b, last);
        }
        xlate_group_stats(ctx, group, NULL);
    } else {
        /* Single-bucket group types: pick exactly one bucket. */
        struct ofputil_bucket *chosen;

        if (group->up.type == OFPGT11_SELECT) {
            chosen = pick_select_group(ctx, group);
        } else if (group->up.type == OFPGT11_FF) {
            chosen = pick_ff_group(ctx, group);
        } else {
            OVS_NOT_REACHED();
        }

        if (!chosen) {
            xlate_report(ctx, OFT_DETAIL, "no live bucket");
            if (ctx->xin->xcache) {
                /* The lookup took a reference only when an xcache exists;
                 * release it since no bucket was translated. */
                ofproto_group_unref(&group->up);
            }
        } else {
            xlate_report(ctx, OFT_DETAIL, "using bucket %"PRIu32,
                         chosen->bucket_id);
            xlate_group_bucket(ctx, chosen, is_last_action);
            xlate_group_stats(ctx, group, chosen);
        }
    }
}
static bool
xlate_group_action(struct xlate_ctx *ctx, uint32_t group_id,
                   bool is_last_action)
{
    if (!xlate_resubmit_resource_check(ctx)) {
        return false;
    }

    /* Take ref only if xcache exists. */
    struct group_dpif *group
        = group_dpif_lookup(ctx->xbridge->ofproto, group_id,
                            ctx->xin->tables_version, ctx->xin->xcache);
    if (!group) {
        /* XXX: Should set ctx->error ? */
        xlate_report(ctx, OFT_WARN, "output to nonexistent group %"PRIu32,
                     group_id);
        return true;
    }

    xlate_group_action__(ctx, group, is_last_action);
    return false;
}
static void
xlate_ofpact_resubmit(struct xlate_ctx *ctx,
                      const struct ofpact_resubmit *resubmit,
                      bool is_last_action)
{
    bool may_packet_in = false;
    bool honor_table_miss = false;

    if (ctx->rule && rule_dpif_is_internal(ctx->rule)) {
        /* Still allow missed packets to be sent to the controller
         * if resubmitting from an internal table. */
        may_packet_in = true;
        honor_table_miss = true;
    }

    /* OFPP_IN_PORT means "resubmit with the flow's current input port";
     * table_id 255 means "resubmit into the current table". */
    ofp_port_t in_port = (resubmit->in_port == OFPP_IN_PORT
                          ? ctx->xin->flow.in_port.ofp_port
                          : resubmit->in_port);
    uint8_t table_id = (resubmit->table_id == 255
                        ? ctx->table_id
                        : resubmit->table_id);

    xlate_table_action(ctx, in_port, table_id, may_packet_in,
                       honor_table_miss, resubmit->with_ct_orig,
                       is_last_action, do_xlate_actions);
}
static void
flood_packet_to_port(struct xlate_ctx *ctx, const struct xport *xport,
                     bool all, bool is_last_action)
{
    /* A null port is a valid no-op; flood_packets() passes NULL before the
     * first eligible port has been found. */
    if (xport) {
        if (all) {
            compose_output_action__(ctx, xport->ofp_port, NULL, false,
                                    is_last_action, false);
        } else {
            compose_output_action(ctx, xport->ofp_port, NULL, is_last_action,
                                  false);
        }
    }
}
static void
flood_packets(struct xlate_ctx *ctx, bool all, bool is_last_action)
{
    const struct xport *pending = NULL;
    const struct xport *xport;

    /* Defer each output by one iteration so that the final eligible port is
     * known when it is emitted and can carry 'is_last_action'. */
    HMAP_FOR_EACH (xport, ofp_node, &ctx->xbridge->xports) {
        if (xport->ofp_port == ctx->xin->flow.in_port.ofp_port) {
            /* Never flood back out the ingress port. */
            continue;
        }
        if (all || !(xport->config & OFPUTIL_PC_NO_FLOOD)) {
            /* Emit the previously found port (if any), then remember this
             * one as the new candidate for "last". */
            flood_packet_to_port(ctx, pending, all, false);
            pending = xport;
        }
    }

    /* Emit the final port with the real 'is_last_action' value. */
    flood_packet_to_port(ctx, pending, all, is_last_action);

    ctx->nf_output_iface = NF_OUT_FLOOD;
}
/* Appends a datapath userspace action to 'ctx->odp_actions' that sends the
 * packet to the controller with the given parameters encoded in a
 * USER_ACTION_COOKIE_CONTROLLER cookie. */
static void
put_controller_user_action(struct xlate_ctx *ctx,
                           bool dont_send, bool continuation,
                           uint32_t recirc_id, int len,
                           enum ofp_packet_in_reason reason,
                           uint16_t controller_id)
{
    struct user_action_cookie cookie;

    memset(&cookie, 0, sizeof cookie);
    cookie.type = USER_ACTION_COOKIE_CONTROLLER;
    /* Bug fix: this assignment previously ended with a comma (comma
     * operator), not a semicolon.  Behavior was identical but it was
     * clearly a typo. */
    cookie.ofp_in_port = OFPP_NONE;
    cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid;
    cookie.controller.dont_send = dont_send;
    cookie.controller.continuation = continuation;
    cookie.controller.reason = reason;
    cookie.controller.recirc_id = recirc_id;
    put_32aligned_be64(&cookie.controller.rule_cookie, ctx->rule_cookie);
    cookie.controller.controller_id = controller_id;
    cookie.controller.max_len = len;

    odp_port_t odp_port = ofp_port_to_odp_port(ctx->xbridge,
                                         ctx->xin->flow.in_port.ofp_port);
    uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port);
    odp_put_userspace_action(pid, &cookie, sizeof cookie, ODPP_NONE,
                             false, ctx->odp_actions, NULL);
}
/* Translates a "send to controller" action: freezes the current pipeline
 * state under a fresh recirculation id and emits the datapath actions that
 * deliver (up to 'len' bytes of) the packet to controller 'controller_id',
 * optionally rate-limited by a meter. */
static void
xlate_controller_action(struct xlate_ctx *ctx, int len,
                        enum ofp_packet_in_reason reason,
                        uint16_t controller_id,
                        uint32_t provider_meter_id,
                        const uint8_t *userdata, size_t userdata_len)
{
    xlate_commit_actions(ctx);

    /* A packet sent by an action in a table-miss rule is considered an
     * explicit table miss.  OpenFlow before 1.3 doesn't have that concept so
     * it will get translated back to OFPR_ACTION for those versions. */
    if (reason == OFPR_ACTION
        && ctx->rule && rule_is_table_miss(&ctx->rule->up)) {
        reason = OFPR_EXPLICIT_MISS;
    }

    /* Snapshot the translation state so the pipeline can be resumed if the
     * controller sends the packet back. */
    struct frozen_state state = {
        .table_id = ctx->table_id,
        .ofproto_uuid = ctx->xbridge->ofproto->uuid,
        .stack = ctx->stack.data,
        .stack_size = ctx->stack.size,
        .mirrors = ctx->mirrors,
        .conntracked = ctx->conntracked,
        .was_mpls = ctx->was_mpls,
        .ofpacts = NULL,
        .ofpacts_len = 0,
        .action_set = NULL,
        .action_set_len = 0,
        .userdata = CONST_CAST(uint8_t *, userdata),
        .userdata_len = userdata_len,
    };
    frozen_metadata_from_flow(&state.metadata, &ctx->xin->flow);

    uint32_t recirc_id = recirc_alloc_id_ctx(&state);
    if (!recirc_id) {
        xlate_report_error(ctx, "Failed to allocate recirculation id");
        ctx->error = XLATE_NO_RECIRCULATION_CONTEXT;
        return;
    }
    recirc_refs_add(&ctx->xout->recircs, recirc_id);

    /* If the controller action didn't request a meter (indicated by a
     * 'meter_id' argument other than NX_CTLR_NO_METER), see if one was
     * configured through the "controller" virtual meter.
     *
     * Internally, ovs-vswitchd uses UINT32_MAX to indicate no meter is
     * configured. */
    uint32_t meter_id = (provider_meter_id == UINT32_MAX
                         ? ctx->xbridge->ofproto->up.controller_meter_id
                         : provider_meter_id);
    bool metered = meter_id != UINT32_MAX;

    size_t sample_offset = 0;
    size_t actions_offset = 0;
    if (metered) {
        /* If controller meter is configured, generate clone(meter, userspace)
         * action. */
        sample_offset = nl_msg_start_nested(ctx->odp_actions,
                                            OVS_ACTION_ATTR_SAMPLE);
        nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY,
                       UINT32_MAX);
        actions_offset = nl_msg_start_nested(ctx->odp_actions,
                                             OVS_SAMPLE_ATTR_ACTIONS);
        nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id);
    }

    /* Generate the datapath flows even if we don't send the packet-in
     * so that debugging more closely represents normal state. */
    bool dont_send = !ctx->xin->allow_side_effects && !ctx->xin->xcache;
    put_controller_user_action(ctx, dont_send, false, recirc_id, len,
                               reason, controller_id);

    if (metered) {
        nl_msg_end_nested(ctx->odp_actions, actions_offset);
        nl_msg_end_nested(ctx->odp_actions, sample_offset);
    }
}
/* Creates a frozen state, and allocates a unique recirc id for the given
 * state.  Returns a non-zero recirc id if it is allocated successfully.
 * Returns 0 otherwise.
 */
static uint32_t
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
finish_freezing__(struct xlate_ctx *ctx, uint8_t table)
{
    /* Serializes the current translation state ("freezing") into a frozen
     * state record, allocates a recirculation ID for it, and emits either a
     * controller userspace action (for "controller(pause)" continuations) or
     * a datapath recirculation action that will resume translation in
     * 'table'.
     *
     * Returns the allocated recirculation ID on success, or 0 if no ID could
     * be allocated (in which case ctx->error is set) or if a pause has no
     * side effects to record.  Must only be called while ctx->freezing. */
    ovs_assert(ctx->freezing);

    /* Snapshot everything a resumed translation needs: the resume table,
     * bridge identity, NXAST_PUSH/POP stack, mirror and conntrack state,
     * remaining frozen actions, the OpenFlow action set, and any controller
     * userdata from a pending "pause" action.  Pointers reference ctx-owned
     * buffers; recirc_alloc_id_ctx() is responsible for copying whatever it
     * keeps beyond this call. */
    struct frozen_state state = {
        .table_id = table,
        .ofproto_uuid = ctx->xbridge->ofproto->uuid,
        .stack = ctx->stack.data,
        .stack_size = ctx->stack.size,
        .mirrors = ctx->mirrors,
        .conntracked = ctx->conntracked,
        /* 'was_mpls' is part of the frozen state so that a packet whose MPLS
         * header was popped hashes/recirculates differently than before the
         * pop (avoids recirculation loops with dp_hash select groups). */
        .was_mpls = ctx->was_mpls,
        .xport_uuid = ctx->xin->xport_uuid,
        .ofpacts = ctx->frozen_actions.data,
        .ofpacts_len = ctx->frozen_actions.size,
        .action_set = ctx->action_set.data,
        .action_set_len = ctx->action_set.size,
        /* Only a "controller(pause)" action supplies userdata. */
        .userdata = ctx->pause ? CONST_CAST(uint8_t *,ctx->pause->userdata)
                               : NULL,
        .userdata_len = ctx->pause ? ctx->pause->userdata_len : 0,
    };
    /* Capture flow metadata (registers, tunnel info, etc.) into the frozen
     * state alongside the explicit fields above. */
    frozen_metadata_from_flow(&state.metadata, &ctx->xin->flow);

    /* Allocate a unique recirc id for the given metadata state in the
     * flow.  An existing id, with a new reference to the corresponding
     * recirculation context, will be returned if possible.
     * The life-cycle of this recirc id is managed by associating it
     * with the udpif key ('ukey') created for each new datapath flow. */
    uint32_t recirc_id = recirc_alloc_id_ctx(&state);
    if (!recirc_id) {
        xlate_report_error(ctx, "Failed to allocate recirculation id");
        ctx->error = XLATE_NO_RECIRCULATION_CONTEXT;
        return 0;
    }
    /* Track the reference so the id is released when this translation's
     * output (xout) is destroyed or the ukey takes ownership. */
    recirc_refs_add(&ctx->xout->recircs, recirc_id);

    if (ctx->pause) {
        /* "controller(pause)" continuation: hand the packet to the
         * controller rather than recirculating in the datapath. */
        if (!ctx->xin->allow_side_effects && !ctx->xin->xcache) {
            /* Side effects are disallowed and nothing is being cached, so
             * there is no point emitting the controller action. */
            return 0;
        }

        /* Emit the userspace action that generates the NXT_PACKET_IN2
         * carrying this continuation's recirc id. */
        put_controller_user_action(ctx, false, true, recirc_id,
                                   ctx->pause->max_len,
                                   ctx->pause->reason,
                                   ctx->pause->controller_id);
    } else {
        if (ctx->recirc_update_dp_hash) {
            struct ovs_action_hash *act_hash;

            /* Hash action: recompute dp_hash before recirculating so a
             * dp_hash-based select group sees a fresh hash value. */
            act_hash = nl_msg_put_unspec_uninit(ctx->odp_actions,
                                                OVS_ACTION_ATTR_HASH,
                                                sizeof *act_hash);
            act_hash->hash_alg = ctx->dp_hash_alg;
            act_hash->hash_basis = ctx->dp_hash_basis;
        }
        /* Datapath recirculation: resume translation via 'recirc_id'. */
        nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, recirc_id);
    }

    /* Undo changes done by freezing. */
    ctx_cancel_freeze(ctx);
    return recirc_id;
}
/* Called only when we're freezing. */
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
/* Completes the freeze that is in progress in 'ctx': commits any pending
 * modifications as datapath actions, then emits the frozen-state actions
 * via finish_freezing__() with table 0 as the resume point. */
static void
finish_freezing(struct xlate_ctx *ctx)
{
    xlate_commit_actions(ctx);
    finish_freezing__(ctx, 0);
}
/* Fork the pipeline here.  The current packet will continue processing the
 * current action list.  A clone of the current packet will recirculate, skip
 * the remainder of the current action list and asynchronously resume pipeline
 * processing in 'table' with the current metadata and action set. */
static void
compose_recirculate_and_fork(struct xlate_ctx *ctx, uint8_t table,
                             const uint16_t zone)
{
    uint32_t id;

    ctx->freezing = true;
    id = finish_freezing__(ctx, table);

    /* Only record a trace node when tracing is active and a recirculation
     * id was actually allocated. */
    if (!OVS_UNLIKELY(ctx->xin->trace) || !id) {
        return;
    }

    bool traced = oftrace_add_recirc_node(ctx->xin->recirc_queue,
                                          OFT_RECIRC_CONNTRACK,
                                          &ctx->xin->flow,
                                          ctx->ct_nat_action,
                                          ctx->xin->packet, id, zone);
    if (traced) {
        xlate_report(ctx, OFT_DETAIL, "A clone of the packet is forked to "
                     "recirculate. The forked pipeline will be resumed at "
                     "table %u.", table);
    } else {
        xlate_report(ctx, OFT_DETAIL, "Failed to trace the conntrack "
                     "forked pipeline with recirc_id = %d.", id);
    }
}
static void
compose_mpls_push_action(struct xlate_ctx *ctx, struct ofpact_push_mpls *mpls)
{
struct flow *flow = &ctx->xin->flow;
int n;
ovs_assert(eth_type_mpls(mpls->ethertype));
n = flow_count_mpls_labels(flow, ctx->wc);
if (!n) {
xlate_commit_actions(ctx);
} else if (n >= FLOW_MAX_MPLS_LABELS) {
if (ctx->xin->packet != NULL) {
xlate_report_error(ctx, "dropping packet on which an MPLS push "
"action can't be performed as it would have "
"more MPLS LSEs than the %d supported.",
FLOW_MAX_MPLS_LABELS);
}
ctx->error = XLATE_TOO_MANY_MPLS_LABELS;
return;
}
mpls: Fix MPLS restoration after patch port and group bucket. This patch fixes problems with MPLS handling related to patch ports and group buckets. If a group bucket or a peer bridge across a patch port pushes MPLS headers to a non-MPLS packet and outputs, the flow translation after returning from the group bucket or patch port would undo the packet transformations so that the processing could continue with the packet as it was before entering the patch port. There were two problems with this: 1. As part of the first MPLS push on a non-MPLS packet, the flow translation would first clear the L3/4 headers of the 'flow' to mark those fields invalid. Later, when committing 'flow' changes to datapath actions before output, the necessary datapath MPLS actions are created and the corresponding changes updated to the 'base flow'. This was done using the same flow_push_mpls() function that clears the L2/3 headers, so also the 'base flow' L2/3 headers were cleared. Then, when translation returns from a patch port or group bucket, the original 'flow' is restored, now showing no sign of the MPLS labels. Since the 'base flow' now has the MPLS labels, following translations know to issue MPLS POP actions before any output actions. However, as part of checking for changes to IP headers we test that the IP protocol type was not changed. But now the 'base flow's 'nw_proto' field is zero and an assert fail crashes OVS. This is solved by not clearing the L3/4 fields of the 'base flow'. This allows the processing after the patch port to continue with L3/4 fields as if no MPLS was done, after first issuing the necessary MPLS POP actions. 2. IP header updates were done before the MPLS POP actions were issued. This caused incorrect packet output after, e.g., group action or patch port. 
For example, with actions: group 1234: all bucket=push_mpls,output:LOCAL ip actions=group:1234,dec_ttl,output:LOCAL,output:LOCAL the dec_ttl would only be executed before the last output to LOCAL, since at the time of committing IP changes after the group action the packet was still an MPLS packet. This is solved by checking the dl_type of both 'flow' and 'base flow' and issuing MPLS actions if they can transform the packet from an MPLS packet to a non-MPLS packet. For an IP packet the change in ttl can then be correctly committed before the last two output actions. Two test cases are added to prevent future regressions. Reported-by: Thomas Morin <thomas.morin@orange.com> Suggested-by: Takashi YAMAMOTO <yamamoto@ovn.org> Fixes: 8bfd0fdac ("Enhance userspace support for MPLS, for up to 3 labels.") Fixes: 1b035ef20 ("mpls: Allow l3 and l4 actions to prior to a push_mpls action") Signed-off-by: Jarno Rajahalme <jarno@ovn.org> Acked-by: YAMAMOTO Takashi <yamamoto@ovn.org>
2016-12-01 14:05:24 -08:00
/* Update flow's MPLS stack, and clear L3/4 fields to mark them invalid. */
flow_push_mpls(flow, n, mpls->ethertype, ctx->wc, true);
}
/* Translates an MPLS pop action: removes the outermost label, exposing a
 * payload of 'eth_type'.  Popping the last label on a recirculation-capable
 * datapath marks the translation as having left MPLS (ctx->was_mpls).  On
 * failure with an over-deep stack, the packet is dropped. */
static void
compose_mpls_pop_action(struct xlate_ctx *ctx, ovs_be16 eth_type)
{
    struct flow *flow = &ctx->xin->flow;
    int label_count = flow_count_mpls_labels(flow, ctx->wc);
    bool popped = flow_pop_mpls(flow, label_count, eth_type, ctx->wc);

    if (popped) {
        if (ctx->xbridge->support.odp.recirc && !eth_type_mpls(eth_type)) {
            ctx->was_mpls = true;
        }
        return;
    }

    if (label_count >= FLOW_MAX_MPLS_LABELS) {
        if (ctx->xin->packet != NULL) {
            xlate_report_error(ctx, "dropping packet on which an "
                               "MPLS pop action can't be performed as it has "
                               "more MPLS LSEs than the %d supported.",
                               FLOW_MAX_MPLS_LABELS);
        }
        ctx->error = XLATE_TOO_MANY_MPLS_LABELS;
        ofpbuf_clear(ctx->odp_actions);
    }
}
/* Decrements the IP TTL of the flow being translated.  Returns true if
 * translation of the current table should stop (TTL expired, controllers in
 * 'ids' notified), false if processing may continue.  Non-IP flows return
 * false without any change. */
static bool
compose_dec_ttl(struct xlate_ctx *ctx, struct ofpact_cnt_ids *ids)
{
    struct flow *flow = &ctx->xin->flow;

    if (!is_ip_any(flow)) {
        return false;
    }

    ctx->wc->masks.nw_ttl = 0xff;
    WC_MASK_FIELD(ctx->wc, nw_proto);

    if (flow->nw_ttl <= 1) {
        /* TTL exhausted: notify every controller listed in 'ids'. */
        for (size_t i = 0; i < ids->n_controllers; i++) {
            xlate_controller_action(ctx, UINT16_MAX, OFPR_INVALID_TTL,
                                    ids->cnt_ids[i], UINT32_MAX, NULL, 0);
        }

        /* Stop processing for current table. */
        xlate_report(ctx, OFT_WARN, "IPv%d decrement TTL exception",
                     flow->dl_type == htons(ETH_TYPE_IP) ? 4 : 6);
        return true;
    }

    flow->nw_ttl--;
    return false;
}
/* Sets the label of the outermost MPLS LSE to 'label'.  A no-op unless the
 * flow is currently MPLS. */
static void
compose_set_mpls_label_action(struct xlate_ctx *ctx, ovs_be32 label)
{
    if (!eth_type_mpls(ctx->xin->flow.dl_type)) {
        return;
    }
    ctx->wc->masks.mpls_lse[0] |= htonl(MPLS_LABEL_MASK);
    set_mpls_lse_label(&ctx->xin->flow.mpls_lse[0], label);
}
/* Sets the traffic class of the outermost MPLS LSE to 'tc'.  A no-op unless
 * the flow is currently MPLS. */
static void
compose_set_mpls_tc_action(struct xlate_ctx *ctx, uint8_t tc)
{
    if (!eth_type_mpls(ctx->xin->flow.dl_type)) {
        return;
    }
    ctx->wc->masks.mpls_lse[0] |= htonl(MPLS_TC_MASK);
    set_mpls_lse_tc(&ctx->xin->flow.mpls_lse[0], tc);
}
/* Decrements the NSH TTL of the flow being translated.  Returns false only
 * when the flow is NSH and its TTL was greater than 1; otherwise reports a
 * TTL exception (sending an expired NSH packet to the controller) and
 * returns true so the current table stops processing. */
static bool
compose_dec_nsh_ttl_action(struct xlate_ctx *ctx)
{
    struct flow *flow = &ctx->xin->flow;
    bool is_nsh = flow->packet_type == htonl(PT_NSH)
                  || flow->dl_type == htons(ETH_TYPE_NSH);

    if (is_nsh) {
        ctx->wc->masks.nsh.ttl = 0xff;
        if (flow->nsh.ttl > 1) {
            flow->nsh.ttl--;
            return false;
        }
        xlate_controller_action(ctx, UINT16_MAX, OFPR_INVALID_TTL,
                                0, UINT32_MAX, NULL, 0);
    }

    /* Stop processing for current table. */
    xlate_report(ctx, OFT_WARN, "NSH decrement TTL exception");
    return true;
}
/* Sets the TTL of the outermost MPLS LSE to 'ttl'.  A no-op unless the flow
 * is currently MPLS. */
static void
compose_set_mpls_ttl_action(struct xlate_ctx *ctx, uint8_t ttl)
{
    if (!eth_type_mpls(ctx->xin->flow.dl_type)) {
        return;
    }
    ctx->wc->masks.mpls_lse[0] |= htonl(MPLS_TTL_MASK);
    set_mpls_lse_ttl(&ctx->xin->flow.mpls_lse[0], ttl);
}
/* Decrements the TTL of the outermost MPLS LSE.  Returns false only when the
 * flow is MPLS and its TTL was greater than 1; otherwise reports a TTL
 * exception (sending an expired packet to the controller) and returns true
 * so the current table stops processing. */
static bool
compose_dec_mpls_ttl_action(struct xlate_ctx *ctx)
{
    struct flow *flow = &ctx->xin->flow;

    if (eth_type_mpls(flow->dl_type)) {
        uint8_t ttl = mpls_lse_to_ttl(flow->mpls_lse[0]);

        ctx->wc->masks.mpls_lse[0] |= htonl(MPLS_TTL_MASK);
        if (ttl <= 1) {
            xlate_controller_action(ctx, UINT16_MAX, OFPR_INVALID_TTL, 0,
                                    UINT32_MAX, NULL, 0);
        } else {
            set_mpls_lse_ttl(&flow->mpls_lse[0], ttl - 1);
            return false;
        }
    }

    /* Stop processing for current table. */
    xlate_report(ctx, OFT_WARN, "MPLS decrement TTL exception");
    return true;
}
/* Translates a delete_field action on 'flow' and reports it to the trace. */
static void
xlate_delete_field(struct xlate_ctx *ctx,
                   struct flow *flow,
                   const struct ofpact_delete_field *odf)
{
    /* Currently, only tun_metadata is allowed for delete_field action. */
    tun_metadata_delete(&flow->tunnel, odf->field);

    /* Reports the same "delete <field>" text as before, formatted directly
     * instead of via a dynamic string buffer. */
    xlate_report(ctx, OFT_DETAIL, "delete %s", odf->field->name);
}
/* Emits an action that outputs to 'port', within 'ctx'.
 *
 * 'controller_len' affects only packets sent to an OpenFlow controller.  It
 * is the maximum number of bytes of the packet to send.  UINT16_MAX means to
 * send the whole packet (and 0 means to omit the packet entirely).
 *
 * 'may_packet_in' determines whether the packet may be sent to an OpenFlow
 * controller.  If it is false, then the packet is never sent to the OpenFlow
 * controller.
 *
 * 'is_last_action' should be true if this output is the last OpenFlow action
 * to be processed, which enables certain optimizations.
 *
 * 'truncate' should be true if the packet to be output is being truncated,
 * which suppresses certain optimizations.
 *
 * 'group_bucket_action' should be true when this output is being translated
 * from within a group bucket, so that output to the controller uses
 * OFPR_GROUP as the packet-in reason. */
static void
xlate_output_action(struct xlate_ctx *ctx, ofp_port_t port,
                    uint16_t controller_len, bool may_packet_in,
                    bool is_last_action, bool truncate,
                    bool group_bucket_action)
{
    ofp_port_t prev_nf_output_iface = ctx->nf_output_iface;

    ctx->nf_output_iface = NF_OUT_DROP;

    switch (port) {
    case OFPP_IN_PORT:
        compose_output_action(ctx, ctx->xin->flow.in_port.ofp_port, NULL,
                              is_last_action, truncate);
        break;
    case OFPP_TABLE:
        xlate_table_action(ctx, ctx->xin->flow.in_port.ofp_port,
                           0, may_packet_in, true, false, false,
                           do_xlate_actions);
        break;
    case OFPP_NORMAL:
        xlate_normal(ctx);
        break;
    case OFPP_FLOOD:
        flood_packets(ctx, false, is_last_action);
        break;
    case OFPP_ALL:
        flood_packets(ctx, true, is_last_action);
        break;
    case OFPP_CONTROLLER:
        /* Choose the packet-in reason according to the translation context:
         * packet-out resumption, group bucket, action set, or plain action. */
        xlate_controller_action(ctx, controller_len,
                                (ctx->in_packet_out ? OFPR_PACKET_OUT
                                 : group_bucket_action ? OFPR_GROUP
                                 : ctx->in_action_set ? OFPR_ACTION_SET
                                 : OFPR_ACTION),
                                0, UINT32_MAX, NULL, 0);
        break;
    case OFPP_NONE:
        break;
    case OFPP_LOCAL:
    default:
        if (port != ctx->xin->flow.in_port.ofp_port) {
            compose_output_action(ctx, port, NULL, is_last_action, truncate);
        } else {
            xlate_report_info(ctx, "skipping output to input port");
        }
        break;
    }

    /* Update the NetFlow output interface: flooding is sticky, a single
     * output keeps its port, and multiple distinct outputs collapse to
     * NF_OUT_MULTI. */
    if (prev_nf_output_iface == NF_OUT_FLOOD) {
        ctx->nf_output_iface = NF_OUT_FLOOD;
    } else if (ctx->nf_output_iface == NF_OUT_DROP) {
        ctx->nf_output_iface = prev_nf_output_iface;
    } else if (prev_nf_output_iface != NF_OUT_DROP &&
               ctx->nf_output_iface != NF_OUT_FLOOD) {
        ctx->nf_output_iface = NF_OUT_MULTI;
    }
}
static void
xlate_output_reg_action(struct xlate_ctx *ctx,
const struct ofpact_output_reg *or,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
bool is_last_action,
bool group_bucket_action)
{
uint64_t port = mf_get_subfield(&or->src, &ctx->xin->flow);
if (port <= UINT16_MAX) {
union mf_subvalue *value = xmalloc(sizeof *value);
xlate_report(ctx, OFT_DETAIL, "output port is %"PRIu64, port);
memset(value, 0xff, sizeof *value);
mf_write_subfield_flow(&or->src, value, &ctx->wc->masks);
xlate_output_action(ctx, u16_to_ofp(port), or->max_len,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
false, is_last_action, false,
group_bucket_action);
free(value);
} else {
xlate_report(ctx, OFT_WARN, "output port %"PRIu64" is out of range",
port);
}
}
static void
xlate_output_trunc_action(struct xlate_ctx *ctx,
ofp_port_t port, uint32_t max_len,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
bool is_last_action,
bool group_bucket_action)
{
bool support_trunc = ctx->xbridge->support.trunc;
struct ovs_action_trunc *trunc;
char name[OFP_MAX_PORT_NAME_LEN];
switch (port) {
case OFPP_TABLE:
case OFPP_NORMAL:
case OFPP_FLOOD:
case OFPP_ALL:
case OFPP_CONTROLLER:
case OFPP_NONE:
ofputil_port_to_string(port, NULL, name, sizeof name);
xlate_report(ctx, OFT_WARN,
"output_trunc does not support port: %s", name);
break;
case OFPP_LOCAL:
case OFPP_IN_PORT:
default:
if (port != ctx->xin->flow.in_port.ofp_port) {
const struct xport *xport = get_ofp_port(ctx->xbridge, port);
if (xport == NULL || xport->odp_port == ODPP_NONE) {
/* Since truncate happens at its following output action, if
* the output port is a patch port, the behavior is somehow
* unpredictable. For simplicity, disallow this case. */
ofputil_port_to_string(port, NULL, name, sizeof name);
xlate_report_error(ctx, "output_trunc does not support "
"patch port %s", name);
break;
}
trunc = nl_msg_put_unspec_uninit(ctx->odp_actions,
OVS_ACTION_ATTR_TRUNC,
sizeof *trunc);
trunc->max_len = max_len;
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
xlate_output_action(ctx, port, 0, false, is_last_action, true,
group_bucket_action);
if (!support_trunc) {
ctx->xout->slow |= SLOW_ACTION;
}
} else {
xlate_report_info(ctx, "skipping output to input port");
}
break;
}
}
static void
xlate_enqueue_action(struct xlate_ctx *ctx,
const struct ofpact_enqueue *enqueue,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
bool is_last_action,
bool group_bucket_action)
{
ofp_port_t ofp_port = enqueue->port;
uint32_t queue_id = enqueue->queue;
uint32_t flow_priority, priority;
int error;
/* Translate queue to priority. */
error = dpif_queue_to_priority(ctx->xbridge->dpif, queue_id, &priority);
if (error) {
/* Fall back to ordinary output action. */
xlate_output_action(ctx, enqueue->port, 0, false,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
is_last_action, false,
group_bucket_action);
return;
}
/* Check output port. */
if (ofp_port == OFPP_IN_PORT) {
ofp_port = ctx->xin->flow.in_port.ofp_port;
} else if (ofp_port == ctx->xin->flow.in_port.ofp_port) {
return;
}
/* Add datapath actions. */
flow_priority = ctx->xin->flow.skb_priority;
ctx->xin->flow.skb_priority = priority;
compose_output_action(ctx, ofp_port, NULL, is_last_action, false);
ctx->xin->flow.skb_priority = flow_priority;
/* Update NetFlow output port. */
if (ctx->nf_output_iface == NF_OUT_DROP) {
ctx->nf_output_iface = ofp_port;
} else if (ctx->nf_output_iface != NF_OUT_FLOOD) {
ctx->nf_output_iface = NF_OUT_MULTI;
}
}
/* Translates an OFPAT_SET_QUEUE action: maps 'queue_id' to a datapath
 * priority and installs it as the flow's skb_priority.  If the queue cannot
 * be mapped, the flow is left unchanged (the mapping function has already
 * logged a warning in that case). */
static void
xlate_set_queue_action(struct xlate_ctx *ctx, uint32_t queue_id)
{
    uint32_t priority;

    if (dpif_queue_to_priority(ctx->xbridge->dpif, queue_id, &priority)) {
        /* Couldn't translate queue to a priority.  Nothing to do. */
        return;
    }
    ctx->xin->flow.skb_priority = priority;
}
/* Callback for bundle_execute(): reports whether 'ofp_port' is usable as a
 * bundle member on the bridge passed in 'xbridge_'.
 *
 * Reserved ports other than OFPP_CONTROLLER are always considered enabled;
 * OFPP_CONTROLLER is not supported by the bundle action.  A physical port is
 * enabled only if it exists and its 'may_enable' flag is set. */
static bool
member_enabled_cb(ofp_port_t ofp_port, void *xbridge_)
{
    const struct xbridge *xbridge = xbridge_;

    switch (ofp_port) {
    case OFPP_CONTROLLER:
        /* Not supported by the bundle action. */
        return false;

    case OFPP_IN_PORT:
    case OFPP_TABLE:
    case OFPP_NORMAL:
    case OFPP_FLOOD:
    case OFPP_ALL:
    case OFPP_NONE:
        return true;

    default: {
        const struct xport *xport = get_ofp_port(xbridge, ofp_port);
        return xport ? xport->may_enable : false;
    }
    }
}
static void
xlate_bundle_action(struct xlate_ctx *ctx,
const struct ofpact_bundle *bundle,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
bool is_last_action,
bool group_bucket_action)
{
ofp_port_t port;
port = bundle_execute(bundle, &ctx->xin->flow, ctx->wc, member_enabled_cb,
CONST_CAST(struct xbridge *, ctx->xbridge));
if (bundle->dst.field) {
nxm_reg_load(&bundle->dst, ofp_to_u16(port), &ctx->xin->flow, ctx->wc);
xlate_report_subfield(ctx, &bundle->dst);
} else {
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
xlate_output_action(ctx, port, 0, false, is_last_action, false,
group_bucket_action);
}
}
ofproto-dpif-xlate: Cache full flowmod for learning. Caching the results of xlate_learn was previously dependent on the state of the 'may_learn' flag. This meant that if the caller did not specify that this flow may learn, then a learn entry would not be cached. However, the xlate_cache tends to be used on a recurring basis, so failing to cache the learn entry can provide unexpected behaviour later on, particularly in corner cases. Such a corner case occurred previously:- * Revalidation was requested. * A flow with a learn action was dumped. * The flow had no packets. * The flow's corresponding xcache was cleared, and the flow revalidated. * The flow went on to receive packets after the xcache is re-created. In this case, the xcache would be re-created, but would not refresh the timeouts on the learnt flow until the next time it was cleared, even if it received more traffic. This would cause flows to time out sooner than expected. Symptoms of this bug may include unexpected forwarding behaviour or extraneous statistics being attributed to the wrong flow. This patch fixes the issue by caching the entire flow_mod, including actions, upon translating an xlate_learn action. This is used to perform a flow_mod from scratch with the original flow, rather than simply refreshing the rule that was created during the creation of the xcache. Bug #1252997. Reported-by: Scott Hendricks <shendricks@vmware.com> Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Alex Wang <alexw@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-06-03 20:44:35 +12:00
static void
xlate_learn_action(struct xlate_ctx *ctx, const struct ofpact_learn *learn)
{
learn_mask(learn, ctx->wc);
if (ctx->xin->xcache || ctx->xin->allow_side_effects) {
ofproto-dpif-xlate: Cache full flowmod for learning. Caching the results of xlate_learn was previously dependent on the state of the 'may_learn' flag. This meant that if the caller did not specify that this flow may learn, then a learn entry would not be cached. However, the xlate_cache tends to be used on a recurring basis, so failing to cache the learn entry can provide unexpected behaviour later on, particularly in corner cases. Such a corner case occurred previously:- * Revalidation was requested. * A flow with a learn action was dumped. * The flow had no packets. * The flow's corresponding xcache was cleared, and the flow revalidated. * The flow went on to receive packets after the xcache is re-created. In this case, the xcache would be re-created, but would not refresh the timeouts on the learnt flow until the next time it was cleared, even if it received more traffic. This would cause flows to time out sooner than expected. Symptoms of this bug may include unexpected forwarding behaviour or extraneous statistics being attributed to the wrong flow. This patch fixes the issue by caching the entire flow_mod, including actions, upon translating an xlate_learn action. This is used to perform a flow_mod from scratch with the original flow, rather than simply refreshing the rule that was created during the creation of the xcache. Bug #1252997. Reported-by: Scott Hendricks <shendricks@vmware.com> Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Alex Wang <alexw@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-06-03 20:44:35 +12:00
uint64_t ofpacts_stub[1024 / 8];
struct ofputil_flow_mod fm;
struct ofproto_flow_mod ofm__, *ofm;
ofproto-dpif-xlate: Cache full flowmod for learning. Caching the results of xlate_learn was previously dependent on the state of the 'may_learn' flag. This meant that if the caller did not specify that this flow may learn, then a learn entry would not be cached. However, the xlate_cache tends to be used on a recurring basis, so failing to cache the learn entry can provide unexpected behaviour later on, particularly in corner cases. Such a corner case occurred previously:- * Revalidation was requested. * A flow with a learn action was dumped. * The flow had no packets. * The flow's corresponding xcache was cleared, and the flow revalidated. * The flow went on to receive packets after the xcache is re-created. In this case, the xcache would be re-created, but would not refresh the timeouts on the learnt flow until the next time it was cleared, even if it received more traffic. This would cause flows to time out sooner than expected. Symptoms of this bug may include unexpected forwarding behaviour or extraneous statistics being attributed to the wrong flow. This patch fixes the issue by caching the entire flow_mod, including actions, upon translating an xlate_learn action. This is used to perform a flow_mod from scratch with the original flow, rather than simply refreshing the rule that was created during the creation of the xcache. Bug #1252997. Reported-by: Scott Hendricks <shendricks@vmware.com> Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Alex Wang <alexw@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-06-03 20:44:35 +12:00
struct ofpbuf ofpacts;
enum ofperr error;
if (ctx->xin->xcache) {
ofm = xmalloc(sizeof *ofm);
} else {
ofm = &ofm__;
}
ofproto-dpif-xlate: Cache full flowmod for learning. Caching the results of xlate_learn was previously dependent on the state of the 'may_learn' flag. This meant that if the caller did not specify that this flow may learn, then a learn entry would not be cached. However, the xlate_cache tends to be used on a recurring basis, so failing to cache the learn entry can provide unexpected behaviour later on, particularly in corner cases. Such a corner case occurred previously:- * Revalidation was requested. * A flow with a learn action was dumped. * The flow had no packets. * The flow's corresponding xcache was cleared, and the flow revalidated. * The flow went on to receive packets after the xcache is re-created. In this case, the xcache would be re-created, but would not refresh the timeouts on the learnt flow until the next time it was cleared, even if it received more traffic. This would cause flows to time out sooner than expected. Symptoms of this bug may include unexpected forwarding behaviour or extraneous statistics being attributed to the wrong flow. This patch fixes the issue by caching the entire flow_mod, including actions, upon translating an xlate_learn action. This is used to perform a flow_mod from scratch with the original flow, rather than simply refreshing the rule that was created during the creation of the xcache. Bug #1252997. Reported-by: Scott Hendricks <shendricks@vmware.com> Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Alex Wang <alexw@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-06-03 20:44:35 +12:00
ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
learn_execute(learn, &ctx->xin->flow, &fm, &ofpacts);
if (OVS_UNLIKELY(ctx->xin->trace)) {
struct ds s = DS_EMPTY_INITIALIZER;
ds_put_format(&s, "table=%"PRIu8" ", fm.table_id);
minimatch_format(&fm.match,
ofproto_get_tun_tab(&ctx->xin->ofproto->up),
NULL, &s, OFP_DEFAULT_PRIORITY);
ds_chomp(&s, ' ');
ds_put_format(&s, " priority=%d", fm.priority);
if (fm.new_cookie) {
ds_put_format(&s, " cookie=%#"PRIx64, ntohll(fm.new_cookie));
}
if (fm.idle_timeout != OFP_FLOW_PERMANENT) {
ds_put_format(&s, " idle=%"PRIu16, fm.idle_timeout);
}
if (fm.hard_timeout != OFP_FLOW_PERMANENT) {
ds_put_format(&s, " hard=%"PRIu16, fm.hard_timeout);
}
if (fm.flags & NX_LEARN_F_SEND_FLOW_REM) {
ds_put_cstr(&s, " send_flow_rem");
}
ds_put_cstr(&s, " actions=");
struct ofpact_format_params fp = { .s = &s };
ofpacts_format(fm.ofpacts, fm.ofpacts_len, &fp);
xlate_report(ctx, OFT_DETAIL, "%s", ds_cstr(&s));
ds_destroy(&s);
}
error = ofproto_dpif_flow_mod_init_for_learn(ctx->xbridge->ofproto,
&fm, ofm);
ofproto-dpif-xlate: Cache full flowmod for learning. Caching the results of xlate_learn was previously dependent on the state of the 'may_learn' flag. This meant that if the caller did not specify that this flow may learn, then a learn entry would not be cached. However, the xlate_cache tends to be used on a recurring basis, so failing to cache the learn entry can provide unexpected behaviour later on, particularly in corner cases. Such a corner case occurred previously:- * Revalidation was requested. * A flow with a learn action was dumped. * The flow had no packets. * The flow's corresponding xcache was cleared, and the flow revalidated. * The flow went on to receive packets after the xcache is re-created. In this case, the xcache would be re-created, but would not refresh the timeouts on the learnt flow until the next time it was cleared, even if it received more traffic. This would cause flows to time out sooner than expected. Symptoms of this bug may include unexpected forwarding behaviour or extraneous statistics being attributed to the wrong flow. This patch fixes the issue by caching the entire flow_mod, including actions, upon translating an xlate_learn action. This is used to perform a flow_mod from scratch with the original flow, rather than simply refreshing the rule that was created during the creation of the xcache. Bug #1252997. Reported-by: Scott Hendricks <shendricks@vmware.com> Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Alex Wang <alexw@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-06-03 20:44:35 +12:00
ofpbuf_uninit(&ofpacts);
if (!error) {
bool success = true;
if (ctx->xin->allow_side_effects) {
error = ofproto_flow_mod_learn(ofm, ctx->xin->xcache != NULL,
learn->limit, &success);
} else if (learn->limit) {
if (!ofm->temp_rule
|| ofm->temp_rule->state != RULE_INSERTED) {
/* The learned rule expired and there are no packets, so
* we cannot learn again. Since the translated actions
* depend on the result of learning, we tell the caller
* that there's no point in caching this result. */
ctx->xout->avoid_caching = true;
}
}
if (learn->flags & NX_LEARN_F_WRITE_RESULT) {
nxm_reg_load(&learn->result_dst, success ? 1 : 0,
&ctx->xin->flow, ctx->wc);
xlate_report_subfield(ctx, &learn->result_dst);
}
if (success && ctx->xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_LEARN);
entry->learn.ofm = ofm;
entry->learn.limit = learn->limit;
ofm = NULL;
} else {
ofproto_flow_mod_uninit(ofm);
}
if (OVS_UNLIKELY(ctx->xin->trace && !success)) {
xlate_report(ctx, OFT_DETAIL, "Limit exceeded, learn failed");
}
}
if (ofm != &ofm__) {
free(ofm);
}
if (error) {
xlate_report_error(ctx, "LEARN action execution failed (%s).",
ofperr_to_string(error));
}
minimatch_destroy(&fm.match);
} else {
xlate_report(ctx, OFT_WARN,
"suppressing side effects, so learn action ignored");
}
}
/* Helper for xlate_fin_timeout(): if the packet carries a TCP FIN or RST
 * flag, reduces 'rule''s timeouts to at most 'idle_timeout' and
 * 'hard_timeout' so that the rule expires soon after the connection ends. */
static void
xlate_fin_timeout__(struct rule_dpif *rule, uint16_t tcp_flags,
                    uint16_t idle_timeout, uint16_t hard_timeout)
{
    if (tcp_flags & (TCP_FIN | TCP_RST)) {
        ofproto_rule_reduce_timeouts(&rule->up, idle_timeout, hard_timeout);
    }
}
/* Translates an OpenFlow "fin_timeout" action: when side effects are
 * allowed, reduces the current rule's timeouts for connections that are
 * finishing; when an xlate cache is in use, records an XC_FIN_TIMEOUT
 * entry so the reduction can be replayed later. */
static void
xlate_fin_timeout(struct xlate_ctx *ctx,
                  const struct ofpact_fin_timeout *oft)
{
    /* No rule means there are no timeouts to adjust. */
    if (!ctx->rule) {
        return;
    }

    if (ctx->xin->allow_side_effects) {
        xlate_fin_timeout__(ctx->rule, ctx->xin->tcp_flags,
                            oft->fin_idle_timeout, oft->fin_hard_timeout);
    }

    if (ctx->xin->xcache) {
        struct xc_entry *entry
            = xlate_cache_add_entry(ctx->xin->xcache, XC_FIN_TIMEOUT);

        /* XC_RULE already holds a reference on the rule, none is taken
         * here. */
        entry->fin.rule = ctx->rule;
        entry->fin.idle = oft->fin_idle_timeout;
        entry->fin.hard = oft->fin_hard_timeout;
    }
}
/* Translates an OpenFlow "sample" action (NXAST_SAMPLE*) into a datapath
 * sample action for flow-based IPFIX.
 *
 * When 'os->sampling_port' names an egress tunnel port for which the
 * collector has tunnel sampling enabled, this first commits a
 * set(tunnel(...)) datapath action so that egress tunnel metadata is
 * visible to the sampled upcall, then emits the sample action itself.
 * Does nothing if the bridge has no IPFIX exporter configured. */
static void
xlate_sample_action(struct xlate_ctx *ctx,
                    const struct ofpact_sample *os)
{
    odp_port_t output_odp_port = ODPP_NONE;
    odp_port_t tunnel_out_port = ODPP_NONE;
    struct dpif_ipfix *ipfix = ctx->xbridge->ipfix;
    bool emit_set_tunnel = false;

    /* Flow sampling is only meaningful with an IPFIX exporter. */
    if (!ipfix) {
        return;
    }

    /* Scale the probability from 16-bit to 32-bit while representing
     * the same percentage. */
    uint32_t probability =
        ((uint32_t) os->probability << 16) | os->probability;

    /* If ofp_port in flow sample action is equal to the ingress ofp_port,
     * this sample action is an input port action and no egress tunnel
     * lookup is needed. */
    if (os->sampling_port != OFPP_NONE &&
        os->sampling_port != ctx->xin->flow.in_port.ofp_port) {
        output_odp_port = ofp_port_to_odp_port(ctx->xbridge,
                                               os->sampling_port);
        if (output_odp_port == ODPP_NONE) {
            xlate_report_error(ctx, "can't use unknown port %d in flow sample "
                               "action", os->sampling_port);
            return;
        }

        /* Egress tunnel metadata is only emitted when the collector set
         * requested tunnel sampling and the sampling port is a tunnel. */
        if (dpif_ipfix_get_flow_exporter_tunnel_sampling(ipfix,
                                                         os->collector_set_id)
            && dpif_ipfix_is_tunnel_port(ipfix, output_odp_port)) {
            tunnel_out_port = output_odp_port;
            emit_set_tunnel = true;
        }
    }

    xlate_commit_actions(ctx);
    /* If 'emit_set_tunnel', sample(sampling_port=1) would translate
     * into datapath sample action set(tunnel(...)), sample(...) and
     * it is used for sampling egress tunnel information. */
    if (emit_set_tunnel) {
        const struct xport *xport = get_ofp_port(ctx->xbridge,
                                                 os->sampling_port);

        if (xport && xport->is_tunnel) {
            struct flow *flow = &ctx->xin->flow;
            tnl_port_send(xport->ofport, flow, ctx->wc);
            if (!ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) {
                struct flow_tnl *flow_tnl;
                const char *tnl_type;

                /* Save the tunnel metadata so it can be restored after
                 * committing the set(tunnel(...)) action, which may
                 * modify 'flow->tunnel'. */
                flow_tnl = xmemdup(&flow->tunnel, sizeof *flow_tnl);
                tnl_type = tnl_port_get_type(xport->ofport);
                commit_odp_tunnel_action(flow, &ctx->base_flow,
                                         ctx->odp_actions, tnl_type);
                flow->tunnel = *flow_tnl;
                free(flow_tnl);
            }
        } else {
            xlate_report_error(ctx,
                               "sampling_port:%d should be a tunnel port.",
                               os->sampling_port);
        }
    }

    /* Build the userspace cookie carried with the sampled packet so the
     * upcall handler can attribute it to the right IPFIX exporter.  The
     * struct must be fully zeroed first: it is later compared with
     * memcmp(), so padding bytes matter. */
    struct user_action_cookie cookie;

    memset(&cookie, 0, sizeof cookie);
    cookie.type = USER_ACTION_COOKIE_FLOW_SAMPLE;
    cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port;
    cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid;
    cookie.flow_sample.probability = os->probability;
    cookie.flow_sample.collector_set_id = os->collector_set_id;
    cookie.flow_sample.obs_domain_id = os->obs_domain_id;
    cookie.flow_sample.obs_point_id = os->obs_point_id;
    cookie.flow_sample.output_odp_port = output_odp_port;
    cookie.flow_sample.direction = os->direction;

    compose_sample_action(ctx, probability, &cookie, tunnel_out_port, false);
}
/* Returns true if every OpenFlow action in 'ofpacts' is "reversible", that
 * is, if any datapath actions it translates into can be undone by datapath
 * actions emitted afterward.
 *
 * OpenFlow actions that emit no datapath actions at all are trivially
 * reversible.  For the others, reversibility depends on the nature of the
 * action and on how it is translated. */
static bool
reversible_actions(const struct ofpact *ofpacts, size_t ofpacts_len)
{
    const struct ofpact *ofpact;

    OFPACT_FOR_EACH (ofpact, ofpacts, ofpacts_len) {
        switch (ofpact->type) {
        /* These translate into datapath actions whose effects cannot be
         * rolled back. */
        case OFPACT_CT:
        case OFPACT_METER:
        case OFPACT_NAT:
        case OFPACT_OUTPUT_TRUNC:
        case OFPACT_ENCAP:
        case OFPACT_DECAP:
        case OFPACT_DEC_NSH_TTL:
            return false;

        /* Everything else is reversible. */
        case OFPACT_BUNDLE:
        case OFPACT_CLEAR_ACTIONS:
        case OFPACT_CLONE:
        case OFPACT_CONJUNCTION:
        case OFPACT_CONTROLLER:
        case OFPACT_CT_CLEAR:
        case OFPACT_DEBUG_RECIRC:
        case OFPACT_DEBUG_SLOW:
        case OFPACT_DEC_MPLS_TTL:
        case OFPACT_DEC_TTL:
        case OFPACT_ENQUEUE:
        case OFPACT_EXIT:
        case OFPACT_FIN_TIMEOUT:
        case OFPACT_GOTO_TABLE:
        case OFPACT_GROUP:
        case OFPACT_LEARN:
        case OFPACT_MULTIPATH:
        case OFPACT_NOTE:
        case OFPACT_OUTPUT:
        case OFPACT_OUTPUT_REG:
        case OFPACT_POP_MPLS:
        case OFPACT_POP_QUEUE:
        case OFPACT_PUSH_MPLS:
        case OFPACT_PUSH_VLAN:
        case OFPACT_REG_MOVE:
        case OFPACT_RESUBMIT:
        case OFPACT_SAMPLE:
        case OFPACT_SET_ETH_DST:
        case OFPACT_SET_ETH_SRC:
        case OFPACT_SET_FIELD:
        case OFPACT_SET_IP_DSCP:
        case OFPACT_SET_IP_ECN:
        case OFPACT_SET_IP_TTL:
        case OFPACT_SET_IPV4_DST:
        case OFPACT_SET_IPV4_SRC:
        case OFPACT_SET_L4_DST_PORT:
        case OFPACT_SET_L4_SRC_PORT:
        case OFPACT_SET_MPLS_LABEL:
        case OFPACT_SET_MPLS_TC:
        case OFPACT_SET_MPLS_TTL:
        case OFPACT_SET_QUEUE:
        case OFPACT_SET_TUNNEL:
        case OFPACT_SET_VLAN_PCP:
        case OFPACT_SET_VLAN_VID:
        case OFPACT_STACK_POP:
        case OFPACT_STACK_PUSH:
        case OFPACT_STRIP_VLAN:
        case OFPACT_UNROLL_XLATE:
        case OFPACT_WRITE_ACTIONS:
        case OFPACT_WRITE_METADATA:
        case OFPACT_CHECK_PKT_LARGER:
        case OFPACT_DELETE_FIELD:
            break;
        }
    }
    return true;
}
static void
clone_xlate_actions(const struct ofpact *actions, size_t actions_len,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
struct xlate_ctx *ctx, bool is_last_action,
bool group_bucket_action OVS_UNUSED)
{
struct xretained_state *retained_state;
size_t offset, ac_offset;
retained_state = xretain_state_save(ctx);
if (reversible_actions(actions, actions_len) || is_last_action) {
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
do_xlate_actions(actions, actions_len, ctx, is_last_action, false);
if (!ctx->freezing) {
xlate_action_set(ctx);
}
if (ctx->freezing) {
finish_freezing(ctx);
}
goto xlate_done;
}
/* Commit datapath actions before emitting the clone action to
* avoid emitting those actions twice. Once inside
* the clone, another time for the action after clone. */
xlate_commit_actions(ctx);
xretain_base_flow_save(ctx, retained_state);
bool old_was_mpls = ctx->was_mpls;
bool old_conntracked = ctx->conntracked;
/* The actions are not reversible, a datapath clone action is
* required to encode the translation. Select the clone action
* based on datapath capabilities. */
if (ctx->xbridge->support.clone) { /* Use clone action */
/* Use clone action as datapath clone. */
offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_CLONE);
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
do_xlate_actions(actions, actions_len, ctx, true, false);
if (!ctx->freezing) {
xlate_action_set(ctx);
}
if (ctx->freezing) {
finish_freezing(ctx);
}
nl_msg_end_non_empty_nested(ctx->odp_actions, offset);
goto dp_clone_done;
}
if (ctx->xbridge->support.sample_nesting > 3) {
/* Use sample action as datapath clone. */
offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_SAMPLE);
ac_offset = nl_msg_start_nested(ctx->odp_actions,
OVS_SAMPLE_ATTR_ACTIONS);
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
do_xlate_actions(actions, actions_len, ctx, true, false);
if (!ctx->freezing) {
xlate_action_set(ctx);
}
if (ctx->freezing) {
finish_freezing(ctx);
}
if (nl_msg_end_non_empty_nested(ctx->odp_actions, ac_offset)) {
nl_msg_cancel_nested(ctx->odp_actions, offset);
} else {
nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY,
UINT32_MAX); /* 100% probability. */
nl_msg_end_nested(ctx->odp_actions, offset);
}
goto dp_clone_done;
}
/* Datapath does not support clone, skip xlate 'oc' and
* report an error */
xlate_report_error(ctx, "Failed to compose clone action");
dp_clone_done:
/* The clone's conntrack execution should have no effect on the original
* packet. */
ctx->conntracked = old_conntracked;
/* Popping MPLS from the clone should have no effect on the original
* packet. */
ctx->was_mpls = old_was_mpls;
/* Restore the 'base_flow' for the next action. */
xretain_base_flow_restore(ctx, retained_state);
xlate_done:
xretain_state_restore_and_free(ctx, retained_state);
}
/* Translates the OpenFlow "clone" action 'oc' by delegating to
 * clone_xlate_actions() on its nested action list. */
static void
compose_clone(struct xlate_ctx *ctx, const struct ofpact_nest *oc,
              bool is_last_action)
{
    clone_xlate_actions(oc->actions, ofpact_nest_get_action_len(oc), ctx,
                        is_last_action, false);
}
/* Emits a datapath meter action for 'meter', if the meter has been
 * instantiated in the datapath (i.e. has a valid provider meter id). */
static void
xlate_meter_action(struct xlate_ctx *ctx, const struct ofpact_meter *meter)
{
    if (meter->provider_meter_id == UINT32_MAX) {
        /* Meter is not available in the datapath; nothing to emit. */
        return;
    }
    nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER,
                   meter->provider_meter_id);
}
/* Returns true if the flow being translated may be received on 'xport',
 * considering the port's NO_RECV configuration and its (R)STP state. */
static bool
may_receive(const struct xport *xport, struct xlate_ctx *ctx)
{
    /* STP BPDUs are governed by NO_RECV_STP, everything else by NO_RECV. */
    if (xport->config & (is_stp(&ctx->xin->flow)
                         ? OFPUTIL_PC_NO_RECV_STP
                         : OFPUTIL_PC_NO_RECV)) {
        return false;
    }

    /* Only drop packets here if both forwarding and learning are
     * disabled.  If just learning is enabled, we need to have
     * OFPP_NORMAL and the learning action have a look at the packet
     * before we can drop it. */
    bool stp_passes = xport_stp_forward_state(xport)
                      || xport_stp_learn_state(xport);
    bool rstp_passes = xport_rstp_forward_state(xport)
                       || xport_rstp_learn_state(xport);
    return stp_passes && rstp_passes;
}
/* Appends 'ofpacts' to the action set, keeping 'actset_output' in sync
 * with the contents of the set:
 *
 *    - OFPP_UNSET, if there is no "output" action.
 *
 *    - The output port, if there is an "output" action and no "group"
 *      action.
 *
 *    - OFPP_UNSET, if there is a "group" action.
 */
static void
xlate_write_actions__(struct xlate_ctx *ctx,
                      const struct ofpact *ofpacts, size_t ofpacts_len)
{
    if (!ctx->action_set_has_group) {
        const struct ofpact *ofpact;

        OFPACT_FOR_EACH (ofpact, ofpacts, ofpacts_len) {
            if (ofpact->type == OFPACT_GROUP) {
                /* A group hides the eventual output port; stop tracking. */
                ctx->xin->flow.actset_output = OFPP_UNSET;
                ctx->action_set_has_group = true;
                break;
            } else if (ofpact->type == OFPACT_OUTPUT) {
                ctx->xin->flow.actset_output = ofpact_get_OUTPUT(ofpact)->port;
            }
        }
    }

    ofpbuf_put(&ctx->action_set, ofpacts, ofpacts_len);
}
static void
xlate_write_actions(struct xlate_ctx *ctx, const struct ofpact_nest *a)
{
xlate_write_actions__(ctx, a->actions, ofpact_nest_get_action_len(a));
}
static void
xlate_action_set(struct xlate_ctx *ctx)
{
uint64_t action_list_stub[1024 / 8];
struct ofpbuf action_list = OFPBUF_STUB_INITIALIZER(action_list_stub);
ofpacts_execute_action_set(&action_list, &ctx->action_set);
/* Clear the action set, as it is not needed any more. */
ofpbuf_clear(&ctx->action_set);
if (action_list.size) {
ctx->in_action_set = true;
struct ovs_list *old_trace = ctx->xin->trace;
ctx->xin->trace = xlate_report(ctx, OFT_TABLE,
"--. Executing action set:");
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
do_xlate_actions(action_list.data, action_list.size, ctx, true, false);
ctx->xin->trace = old_trace;
ctx->in_action_set = false;
}
ofpbuf_uninit(&action_list);
}
/* Ensures that 'frozen_actions' ends with an UNROLL_XLATE action carrying
 * the current table ID and rule cookie, so that a potential PACKET_IN can
 * be attributed correctly after thawing.  A new action is appended only if
 * the most recent one (if any) is stale. */
static void
freeze_put_unroll_xlate(struct xlate_ctx *ctx)
{
    struct ofpact_unroll_xlate *unroll = ctx->frozen_actions.header;

    if (unroll
        && unroll->rule_table_id == ctx->table_id
        && unroll->rule_cookie == ctx->rule_cookie) {
        /* The existing UNROLL_XLATE already records the current state. */
        return;
    }

    unroll = ofpact_put_UNROLL_XLATE(&ctx->frozen_actions);
    unroll->rule_table_id = ctx->table_id;
    unroll->rule_cookie = ctx->rule_cookie;
    ctx->frozen_actions.header = unroll;
}
/* Copy actions 'a' through 'end' to ctx->frozen_actions, which will be
* executed after thawing. Inserts an UNROLL_XLATE action, if none is already
* present, before any action that may depend on the current table ID or flow
* cookie. */
static void
freeze_unroll_actions(const struct ofpact *a, const struct ofpact *end,
struct xlate_ctx *ctx)
{
for (; a < end; a = ofpact_next(a)) {
switch (a->type) {
case OFPACT_OUTPUT_REG:
case OFPACT_OUTPUT_TRUNC:
case OFPACT_GROUP:
case OFPACT_OUTPUT:
case OFPACT_CONTROLLER:
case OFPACT_DEC_MPLS_TTL:
case OFPACT_DEC_NSH_TTL:
case OFPACT_DEC_TTL:
/* These actions may generate asynchronous messages, which include
* table ID and flow cookie information. */
freeze_put_unroll_xlate(ctx);
break;
case OFPACT_RESUBMIT:
if (ofpact_get_RESUBMIT(a)->table_id == 0xff) {
/* This resubmit action is relative to the current table, so we
* need to track what table that is.*/
freeze_put_unroll_xlate(ctx);
}
break;
ofproto-dpif-xlate: Fix continuations with OF instructions in OF1.1+. Open vSwitch supports OpenFlow "instructions", which were introduced in OpenFlow 1.1 and act like restricted kinds of actions that can only appear in a particular order and particular circumstances. OVS did not support two of these instructions, "write_metadata" and "goto_table", properly in the case where they appeared in a flow that needed to be frozen for continuations. Both of these instructions had the problem that they couldn't be properly serialized into the stream of actions, because they're not actions. This commit fixes that problem in freeze_unroll_actions() by converting them into equivalent actions for serialization. goto_table had the additional problem that it was being serialized to the frozen stream even after it had been executed. This was already properly handled in do_xlate_actions() for resubmit, which is almost equivalent to goto_table, so this commit applies the same fix to goto_table. (The commit removes an assertion from the goto_table implementation, but there wasn't any real value in that assertion and I thought the code looked cleaner without it.) This commit adds tests that would have found these bugs. This includes adding a variant of each continuation test that uses OF1.3 for monitor/resume (which is necessary to trigger these bugs) plus specific tests for continuations with goto_table and write_metadata. It also improves the continuation test infrastructure to add more detail on the problem if a test fails. Signed-off-by: Ben Pfaff <blp@ovn.org> Reported-by: Grayson Wu <wgrayson@vmware.com> Reported-at: https://github.com/openvswitch/ovs-issues/issues/213 Discussed-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/386166.html Acked-by: Ilya Maximets <i.maximets@ovn.org>
2021-07-07 11:51:50 -07:00
/* From an OpenFlow point of view, goto_table and write_metadata are
* instructions, not actions. This means that to use them, we'd have
* to reformulate the actions as instructions, which is possible, and
* we'd have slot them into the frozen actions in a specific order,
* which doesn't seem practical. Instead, we translate these
* instructions into equivalent actions. */
case OFPACT_GOTO_TABLE: {
struct ofpact_resubmit *resubmit
= ofpact_put_RESUBMIT(&ctx->frozen_actions);
resubmit->in_port = OFPP_IN_PORT;
resubmit->table_id = ofpact_get_GOTO_TABLE(a)->table_id;
resubmit->with_ct_orig = false;
}
continue;
case OFPACT_WRITE_METADATA: {
const struct ofpact_metadata *md = ofpact_get_WRITE_METADATA(a);
const struct mf_field *mf = mf_from_id(MFF_METADATA);
ovs_assert(mf->n_bytes == sizeof md->metadata);
ovs_assert(mf->n_bytes == sizeof md->mask);
ofpact_put_set_field(&ctx->frozen_actions, mf,
&md->metadata, &md->mask);
}
continue;
case OFPACT_SET_TUNNEL:
case OFPACT_REG_MOVE:
case OFPACT_SET_FIELD:
case OFPACT_STACK_PUSH:
case OFPACT_STACK_POP:
case OFPACT_LEARN:
case OFPACT_ENQUEUE:
case OFPACT_SET_VLAN_VID:
case OFPACT_SET_VLAN_PCP:
case OFPACT_STRIP_VLAN:
case OFPACT_PUSH_VLAN:
case OFPACT_SET_ETH_SRC:
case OFPACT_SET_ETH_DST:
case OFPACT_SET_IPV4_SRC:
case OFPACT_SET_IPV4_DST:
case OFPACT_SET_IP_DSCP:
case OFPACT_SET_IP_ECN:
case OFPACT_SET_IP_TTL:
case OFPACT_SET_L4_SRC_PORT:
case OFPACT_SET_L4_DST_PORT:
case OFPACT_SET_QUEUE:
case OFPACT_POP_QUEUE:
case OFPACT_PUSH_MPLS:
case OFPACT_POP_MPLS:
case OFPACT_SET_MPLS_LABEL:
case OFPACT_SET_MPLS_TC:
case OFPACT_SET_MPLS_TTL:
case OFPACT_MULTIPATH:
case OFPACT_BUNDLE:
case OFPACT_EXIT:
case OFPACT_UNROLL_XLATE:
case OFPACT_FIN_TIMEOUT:
case OFPACT_CLEAR_ACTIONS:
case OFPACT_WRITE_ACTIONS:
case OFPACT_METER:
case OFPACT_SAMPLE:
case OFPACT_CLONE:
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
case OFPACT_ENCAP:
case OFPACT_DECAP:
case OFPACT_DEBUG_RECIRC:
case OFPACT_DEBUG_SLOW:
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
case OFPACT_CT:
case OFPACT_CT_CLEAR:
case OFPACT_NAT:
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
case OFPACT_CHECK_PKT_LARGER:
case OFPACT_DELETE_FIELD:
/* These may not generate PACKET INs. */
break;
case OFPACT_NOTE:
case OFPACT_CONJUNCTION:
/* These need not be copied for restoration. */
continue;
}
/* Copy the action over. */
ofpbuf_put(&ctx->frozen_actions, a, OFPACT_ALIGN(a->len));
}
}
/* Serializes the committed connection mark into 'odp_actions' as an
 * OVS_CT_ATTR_MARK key/mask pair, taking the mask from 'wc'.  Emits nothing
 * when the ct_mark mask is fully wildcarded. */
static void
put_ct_mark(const struct flow *flow, struct ofpbuf *odp_actions,
            struct flow_wildcards *wc)
{
    uint32_t mark_mask = wc->masks.ct_mark;

    if (!mark_mask) {
        return;
    }

    struct {
        uint32_t key;
        uint32_t mask;
    } *ct_mark_attr;

    ct_mark_attr = nl_msg_put_unspec_uninit(odp_actions, OVS_CT_ATTR_MARK,
                                            sizeof *ct_mark_attr);
    ct_mark_attr->key = flow->ct_mark & mark_mask;
    ct_mark_attr->mask = mark_mask;
}
/* Serializes the committed connection label into 'odp_actions' as an
 * OVS_CT_ATTR_LABELS key/mask pair, taking the mask from 'wc'.  Emits
 * nothing when the ct_label mask is all-zeros (fully wildcarded). */
static void
put_ct_label(const struct flow *flow, struct ofpbuf *odp_actions,
             struct flow_wildcards *wc)
{
    if (ovs_u128_is_zero(wc->masks.ct_label)) {
        return;
    }

    struct {
        ovs_u128 key;
        ovs_u128 mask;
    } ct_label_attr = {
        .key = ovs_u128_and(flow->ct_label, wc->masks.ct_label),
        .mask = wc->masks.ct_label,
    };

    nl_msg_put_unspec(odp_actions, OVS_CT_ATTR_LABELS,
                      &ct_label_attr, sizeof ct_label_attr);
}
userspace: Improved packet drop statistics. Currently OVS maintains explicit packet drop/error counters only on port level. Packets that are dropped as part of normal OpenFlow processing are counted in flow stats of “drop” flows or as table misses in table stats. These can only be interpreted by controllers that know the semantics of the configured OpenFlow pipeline. Without that knowledge, it is impossible for an OVS user to obtain e.g. the total number of packets dropped due to OpenFlow rules. Furthermore, there are numerous other reasons for which packets can be dropped by OVS slow path that are not related to the OpenFlow pipeline. The generated datapath flow entries include a drop action to avoid further expensive upcalls to the slow path, but subsequent packets dropped by the datapath are not accounted anywhere. Finally, the datapath itself drops packets in certain error situations. Also, these drops are today not accounted for.This makes it difficult for OVS users to monitor packet drop in an OVS instance and to alert a management system in case of a unexpected increase of such drops. Also OVS trouble-shooters face difficulties in analysing packet drops. With this patch we implement following changes to address the issues mentioned above. 1. Identify and account all the silent packet drop scenarios 2. Display these drops in ovs-appctl coverage/show Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Keshav Gupta <keshugupta1@gmail.com> Signed-off-by: Anju Thomas <anju.thomas@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Keshav Gupta <keshugupta1@gmail.com> Acked-by: Eelco Chaudron <echaudro@redhat.com Acked-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2019-12-18 05:48:12 +01:00
/* Appends an explicit OVS_ACTION_ATTR_DROP action to 'odp_actions', encoding
 * the translation error 'error' as its argument so the datapath can account
 * for the reason the packet was dropped. */
static void
put_drop_action(struct ofpbuf *odp_actions, enum xlate_error error)
{
    nl_msg_put_u32(odp_actions, OVS_ACTION_ATTR_DROP, error);
}
/* Translates the ALG selection from the OpenFlow ct action 'ofc' into an
 * OVS_CT_ATTR_HELPER string attribute in 'odp_actions'.  Only FTP and TFTP
 * helpers can be serialized; any other nonzero ALG is reported as an error
 * via 'ctx'.  A zero ALG means no helper was requested and nothing is
 * emitted. */
static void
put_ct_helper(struct xlate_ctx *ctx,
              struct ofpbuf *odp_actions, struct ofpact_conntrack *ofc)
{
    if (!ofc->alg) {
        return;
    }

    switch (ofc->alg) {
    case IPPORT_FTP:
        nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "ftp");
        break;
    case IPPORT_TFTP:
        nl_msg_put_string(odp_actions, OVS_CT_ATTR_HELPER, "tftp");
        break;
    default:
        xlate_report_error(ctx, "cannot serialize ct_helper %d", ofc->alg);
        break;
    }
}
/* Looks up the timeout policy configured for conntrack zone 'zone_id' and,
 * if one exists for this flow's Ethertype and IP protocol, emits it as an
 * OVS_CT_ATTR_TIMEOUT string attribute in 'odp_actions'.  May narrow the
 * megaflow mask in 'wc' when the datapath keys timeout policies on the IP
 * protocol. */
static void
put_ct_timeout(struct ofpbuf *odp_actions, const struct dpif_backer *backer,
               const struct flow *flow, struct flow_wildcards *wc,
               uint16_t zone_id)
{
    char *policy_name = NULL;
    bool must_unwildcard;

    if (ofproto_dpif_ct_zone_timeout_policy_get_name(backer, zone_id,
            ntohs(flow->dl_type), flow->nw_proto, &policy_name,
            &must_unwildcard)) {
        nl_msg_put_string(odp_actions, OVS_CT_ATTR_TIMEOUT, policy_name);
        if (must_unwildcard) {
            /* The underlying datapath requires separate timeout policies
             * for different Ethertypes and IP protocols.  There is no need
             * to unwildcard 'wc->masks.dl_type' here because that field is
             * always unwildcarded in megaflows. */
            memset(&wc->masks.nw_proto, 0xff, sizeof wc->masks.nw_proto);
        }
    }
    free(policy_name);
}
/* Serializes the pending NAT configuration ('ctx->ct_nat_action', set by a
 * nat() sub-action of ct) as a nested OVS_CT_ATTR_NAT attribute in
 * 'ctx->odp_actions'.  Emits an empty nested attribute if no NAT action is
 * pending flags-wise; does nothing at all when there is no NAT action. */
static void
put_ct_nat(struct xlate_ctx *ctx)
{
    struct ofpact_nat *ofn = ctx->ct_nat_action;
    size_t nat_offset;

    if (!ofn) {
        return;
    }

    nat_offset = nl_msg_start_nested(ctx->odp_actions, OVS_CT_ATTR_NAT);
    /* SNAT and DST are mutually exclusive here; with neither set, the nested
     * attribute stays empty, which requests NAT-ting with existing bindings
     * only. */
    if (ofn->flags & NX_NAT_F_SRC || ofn->flags & NX_NAT_F_DST) {
        nl_msg_put_flag(ctx->odp_actions, ofn->flags & NX_NAT_F_SRC
                        ? OVS_NAT_ATTR_SRC : OVS_NAT_ATTR_DST);
        if (ofn->flags & NX_NAT_F_PERSISTENT) {
            nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PERSISTENT);
        }
        /* HASH takes precedence over RANDOM when both port-selection flags
         * are present. */
        if (ofn->flags & NX_NAT_F_PROTO_HASH) {
            nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_HASH);
        } else if (ofn->flags & NX_NAT_F_PROTO_RANDOM) {
            nl_msg_put_flag(ctx->odp_actions, OVS_NAT_ATTR_PROTO_RANDOM);
        }
        if (ofn->range_af == AF_INET) {
            nl_msg_put_be32(ctx->odp_actions, OVS_NAT_ATTR_IP_MIN,
                            ofn->range.addr.ipv4.min);

            /* IP_MAX is emitted only for a proper range (max > min); a
             * single-address mapping is expressed with IP_MIN alone. */
            if (ofn->range.addr.ipv4.max &&
                (ntohl(ofn->range.addr.ipv4.max)
                 > ntohl(ofn->range.addr.ipv4.min))) {
                nl_msg_put_be32(ctx->odp_actions, OVS_NAT_ATTR_IP_MAX,
                                ofn->range.addr.ipv4.max);
            }
        } else if (ofn->range_af == AF_INET6) {
            nl_msg_put_unspec(ctx->odp_actions, OVS_NAT_ATTR_IP_MIN,
                              &ofn->range.addr.ipv6.min,
                              sizeof ofn->range.addr.ipv6.min);

            /* Same single-address vs. range logic for IPv6, compared
             * byte-wise in network order. */
            if (!ipv6_mask_is_any(&ofn->range.addr.ipv6.max) &&
                memcmp(&ofn->range.addr.ipv6.max, &ofn->range.addr.ipv6.min,
                       sizeof ofn->range.addr.ipv6.max) > 0) {
                nl_msg_put_unspec(ctx->odp_actions, OVS_NAT_ATTR_IP_MAX,
                                  &ofn->range.addr.ipv6.max,
                                  sizeof ofn->range.addr.ipv6.max);
            }
        }

        /* L4 port range, only meaningful when an address family was given. */
        if (ofn->range_af != AF_UNSPEC && ofn->range.proto.min) {
            nl_msg_put_u16(ctx->odp_actions, OVS_NAT_ATTR_PROTO_MIN,
                           ofn->range.proto.min);

            if (ofn->range.proto.max &&
                ofn->range.proto.max > ofn->range.proto.min) {
                nl_msg_put_u16(ctx->odp_actions, OVS_NAT_ATTR_PROTO_MAX,
                               ofn->range.proto.max);
            }
        }
    }
    nl_msg_end_nested(ctx->odp_actions, nat_offset);
}
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
static void
compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc,
bool is_last_action)
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
{
uint16_t zone;
ofproto-dpif-xlate: Fix zone set from non-frozen-metadata fields. CT zone could be set from a field that is not included in frozen metadata. Consider the example rules which are typically seen in OpenStack security group rules: priority=100,in_port=1,tcp,ct_state=-trk,action=ct(zone=5,table=0) priority=100,in_port=1,tcp,ct_state=+trk,action=ct(commit,zone=NXM_NX_CT_ZONE[]),2 The zone is set from the first rule's ct action. These two rules will generate two megaflows: the first one uses zone=5 to query the CT module, the second one sets the zone-id from the first megaflow and commit to CT. The current implementation will generate a megaflow that does not use ct_zone=5 as a match, but directly commit into the ct using zone=5, as zone is set by an Imm not a field. Consider a situation that one changes the zone id (for example to 15) in the first rule, however, still keep the second rule unchanged. During this change, there is traffic hitting the two generated megaflows, the revaldiator would revalidate all megaflows, however, the revalidator will not change the second megaflow, because zone=5 is recorded in the megaflow, so the xlate will still translate the commit action into zone=5, and the new traffic will still commit to CT as zone=5, not zone=15, resulting in taffic drops and other issues. Just like OVS set-field convention, if a field X is set by Y (Y is a variable not an Imm), we should also mask Y as a match in the generated megaflow. An exception is that if the zone-id is set by the field that is included in the frozen state (i.e. regs) and this upcall is a resume of a thawed xlate, the un-wildcarding can be skipped, as the recirc_id is a hash of the values in these fields, and it will change following the changes of these fields. When the recirc_id changes, all megaflows with the old recirc id will be invalid later. 
Fixes: 07659514c3 ("Add support for connection tracking.") Reported-by: Sai Su <susai.ss@bytedance.com> Signed-off-by: Peng He <hepeng.0320@bytedance.com> Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-01 21:09:11 +08:00
if (ofc->zone_src.field) {
union mf_subvalue *value = xmalloc(sizeof *value);
memset(value, 0xff, sizeof *value);
ofproto-dpif-xlate: Fix zone set from non-frozen-metadata fields. CT zone could be set from a field that is not included in frozen metadata. Consider the example rules which are typically seen in OpenStack security group rules: priority=100,in_port=1,tcp,ct_state=-trk,action=ct(zone=5,table=0) priority=100,in_port=1,tcp,ct_state=+trk,action=ct(commit,zone=NXM_NX_CT_ZONE[]),2 The zone is set from the first rule's ct action. These two rules will generate two megaflows: the first one uses zone=5 to query the CT module, the second one sets the zone-id from the first megaflow and commit to CT. The current implementation will generate a megaflow that does not use ct_zone=5 as a match, but directly commit into the ct using zone=5, as zone is set by an Imm not a field. Consider a situation that one changes the zone id (for example to 15) in the first rule, however, still keep the second rule unchanged. During this change, there is traffic hitting the two generated megaflows, the revaldiator would revalidate all megaflows, however, the revalidator will not change the second megaflow, because zone=5 is recorded in the megaflow, so the xlate will still translate the commit action into zone=5, and the new traffic will still commit to CT as zone=5, not zone=15, resulting in taffic drops and other issues. Just like OVS set-field convention, if a field X is set by Y (Y is a variable not an Imm), we should also mask Y as a match in the generated megaflow. An exception is that if the zone-id is set by the field that is included in the frozen state (i.e. regs) and this upcall is a resume of a thawed xlate, the un-wildcarding can be skipped, as the recirc_id is a hash of the values in these fields, and it will change following the changes of these fields. When the recirc_id changes, all megaflows with the old recirc id will be invalid later. 
Fixes: 07659514c3 ("Add support for connection tracking.") Reported-by: Sai Su <susai.ss@bytedance.com> Signed-off-by: Peng He <hepeng.0320@bytedance.com> Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-01 21:09:11 +08:00
zone = mf_get_subfield(&ofc->zone_src, &ctx->xin->flow);
if (ctx->xin->frozen_state) {
/* If the upcall is a resume of a recirculation, we only need to
* unwildcard the fields that are not in the frozen_metadata, as
* when the rules update, OVS will generate a new recirc_id,
 * which will invalidate the megaflow with the old recirc_id.
*/
if (!mf_is_frozen_metadata(ofc->zone_src.field)) {
mf_write_subfield_flow(&ofc->zone_src, value,
ofproto-dpif-xlate: Fix zone set from non-frozen-metadata fields. CT zone could be set from a field that is not included in frozen metadata. Consider the example rules which are typically seen in OpenStack security group rules: priority=100,in_port=1,tcp,ct_state=-trk,action=ct(zone=5,table=0) priority=100,in_port=1,tcp,ct_state=+trk,action=ct(commit,zone=NXM_NX_CT_ZONE[]),2 The zone is set from the first rule's ct action. These two rules will generate two megaflows: the first one uses zone=5 to query the CT module, the second one sets the zone-id from the first megaflow and commit to CT. The current implementation will generate a megaflow that does not use ct_zone=5 as a match, but directly commit into the ct using zone=5, as zone is set by an Imm not a field. Consider a situation that one changes the zone id (for example to 15) in the first rule, however, still keep the second rule unchanged. During this change, there is traffic hitting the two generated megaflows, the revaldiator would revalidate all megaflows, however, the revalidator will not change the second megaflow, because zone=5 is recorded in the megaflow, so the xlate will still translate the commit action into zone=5, and the new traffic will still commit to CT as zone=5, not zone=15, resulting in taffic drops and other issues. Just like OVS set-field convention, if a field X is set by Y (Y is a variable not an Imm), we should also mask Y as a match in the generated megaflow. An exception is that if the zone-id is set by the field that is included in the frozen state (i.e. regs) and this upcall is a resume of a thawed xlate, the un-wildcarding can be skipped, as the recirc_id is a hash of the values in these fields, and it will change following the changes of these fields. When the recirc_id changes, all megaflows with the old recirc id will be invalid later. 
Fixes: 07659514c3 ("Add support for connection tracking.") Reported-by: Sai Su <susai.ss@bytedance.com> Signed-off-by: Peng He <hepeng.0320@bytedance.com> Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-01 21:09:11 +08:00
&ctx->wc->masks);
}
} else {
mf_write_subfield_flow(&ofc->zone_src, value, &ctx->wc->masks);
ofproto-dpif-xlate: Fix zone set from non-frozen-metadata fields. CT zone could be set from a field that is not included in frozen metadata. Consider the example rules which are typically seen in OpenStack security group rules: priority=100,in_port=1,tcp,ct_state=-trk,action=ct(zone=5,table=0) priority=100,in_port=1,tcp,ct_state=+trk,action=ct(commit,zone=NXM_NX_CT_ZONE[]),2 The zone is set from the first rule's ct action. These two rules will generate two megaflows: the first one uses zone=5 to query the CT module, the second one sets the zone-id from the first megaflow and commit to CT. The current implementation will generate a megaflow that does not use ct_zone=5 as a match, but directly commit into the ct using zone=5, as zone is set by an Imm not a field. Consider a situation that one changes the zone id (for example to 15) in the first rule, however, still keep the second rule unchanged. During this change, there is traffic hitting the two generated megaflows, the revaldiator would revalidate all megaflows, however, the revalidator will not change the second megaflow, because zone=5 is recorded in the megaflow, so the xlate will still translate the commit action into zone=5, and the new traffic will still commit to CT as zone=5, not zone=15, resulting in taffic drops and other issues. Just like OVS set-field convention, if a field X is set by Y (Y is a variable not an Imm), we should also mask Y as a match in the generated megaflow. An exception is that if the zone-id is set by the field that is included in the frozen state (i.e. regs) and this upcall is a resume of a thawed xlate, the un-wildcarding can be skipped, as the recirc_id is a hash of the values in these fields, and it will change following the changes of these fields. When the recirc_id changes, all megaflows with the old recirc id will be invalid later. 
Fixes: 07659514c3 ("Add support for connection tracking.") Reported-by: Sai Su <susai.ss@bytedance.com> Signed-off-by: Peng He <hepeng.0320@bytedance.com> Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-01 21:09:11 +08:00
}
free(value);
ofproto-dpif-xlate: Fix zone set from non-frozen-metadata fields. CT zone could be set from a field that is not included in frozen metadata. Consider the example rules which are typically seen in OpenStack security group rules: priority=100,in_port=1,tcp,ct_state=-trk,action=ct(zone=5,table=0) priority=100,in_port=1,tcp,ct_state=+trk,action=ct(commit,zone=NXM_NX_CT_ZONE[]),2 The zone is set from the first rule's ct action. These two rules will generate two megaflows: the first one uses zone=5 to query the CT module, the second one sets the zone-id from the first megaflow and commit to CT. The current implementation will generate a megaflow that does not use ct_zone=5 as a match, but directly commit into the ct using zone=5, as zone is set by an Imm not a field. Consider a situation that one changes the zone id (for example to 15) in the first rule, however, still keep the second rule unchanged. During this change, there is traffic hitting the two generated megaflows, the revaldiator would revalidate all megaflows, however, the revalidator will not change the second megaflow, because zone=5 is recorded in the megaflow, so the xlate will still translate the commit action into zone=5, and the new traffic will still commit to CT as zone=5, not zone=15, resulting in taffic drops and other issues. Just like OVS set-field convention, if a field X is set by Y (Y is a variable not an Imm), we should also mask Y as a match in the generated megaflow. An exception is that if the zone-id is set by the field that is included in the frozen state (i.e. regs) and this upcall is a resume of a thawed xlate, the un-wildcarding can be skipped, as the recirc_id is a hash of the values in these fields, and it will change following the changes of these fields. When the recirc_id changes, all megaflows with the old recirc id will be invalid later. 
Fixes: 07659514c3 ("Add support for connection tracking.") Reported-by: Sai Su <susai.ss@bytedance.com> Signed-off-by: Peng He <hepeng.0320@bytedance.com> Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-01 21:09:11 +08:00
} else {
zone = ofc->zone_imm;
}
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
ofproto-dpif-xlate: Fix zone set from non-frozen-metadata fields. CT zone could be set from a field that is not included in frozen metadata. Consider the example rules which are typically seen in OpenStack security group rules: priority=100,in_port=1,tcp,ct_state=-trk,action=ct(zone=5,table=0) priority=100,in_port=1,tcp,ct_state=+trk,action=ct(commit,zone=NXM_NX_CT_ZONE[]),2 The zone is set from the first rule's ct action. These two rules will generate two megaflows: the first one uses zone=5 to query the CT module, the second one sets the zone-id from the first megaflow and commit to CT. The current implementation will generate a megaflow that does not use ct_zone=5 as a match, but directly commit into the ct using zone=5, as zone is set by an Imm not a field. Consider a situation that one changes the zone id (for example to 15) in the first rule, however, still keep the second rule unchanged. During this change, there is traffic hitting the two generated megaflows, the revaldiator would revalidate all megaflows, however, the revalidator will not change the second megaflow, because zone=5 is recorded in the megaflow, so the xlate will still translate the commit action into zone=5, and the new traffic will still commit to CT as zone=5, not zone=15, resulting in taffic drops and other issues. Just like OVS set-field convention, if a field X is set by Y (Y is a variable not an Imm), we should also mask Y as a match in the generated megaflow. An exception is that if the zone-id is set by the field that is included in the frozen state (i.e. regs) and this upcall is a resume of a thawed xlate, the un-wildcarding can be skipped, as the recirc_id is a hash of the values in these fields, and it will change following the changes of these fields. When the recirc_id changes, all megaflows with the old recirc id will be invalid later. 
Fixes: 07659514c3 ("Add support for connection tracking.") Reported-by: Sai Su <susai.ss@bytedance.com> Signed-off-by: Peng He <hepeng.0320@bytedance.com> Acked-by: Mark D. Gray <mark.d.gray@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-08-01 21:09:11 +08:00
size_t ct_offset;
ovs_u128 old_ct_label_mask = ctx->wc->masks.ct_label;
uint32_t old_ct_mark_mask = ctx->wc->masks.ct_mark;
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
/* Ensure that any prior actions are applied before composing the new
* conntrack action. */
xlate_commit_actions(ctx);
/* Process nested actions first, to populate the key. */
ctx->ct_nat_action = NULL;
ctx->wc->masks.ct_mark = 0;
ctx->wc->masks.ct_label = OVS_U128_ZERO;
do_xlate_actions(ofc->actions, ofpact_ct_get_action_len(ofc), ctx,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
is_last_action, false);
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
ct_offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_CT);
if (ofc->flags & NX_CT_F_COMMIT) {
nl_msg_put_flag(ctx->odp_actions, ofc->flags & NX_CT_F_FORCE ?
OVS_CT_ATTR_FORCE_COMMIT : OVS_CT_ATTR_COMMIT);
if (ctx->xbridge->support.ct_eventmask) {
nl_msg_put_u32(ctx->odp_actions, OVS_CT_ATTR_EVENTMASK,
OVS_CT_EVENTMASK_DEFAULT);
}
if (ctx->xbridge->support.ct_timeout) {
put_ct_timeout(ctx->odp_actions, ctx->xbridge->ofproto->backer,
&ctx->xin->flow, ctx->wc, zone);
}
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
}
nl_msg_put_u16(ctx->odp_actions, OVS_CT_ATTR_ZONE, zone);
put_ct_mark(&ctx->xin->flow, ctx->odp_actions, ctx->wc);
put_ct_label(&ctx->xin->flow, ctx->odp_actions, ctx->wc);
put_ct_helper(ctx, ctx->odp_actions, ofc);
put_ct_nat(ctx);
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
nl_msg_end_nested(ctx->odp_actions, ct_offset);
ctx->wc->masks.ct_mark = old_ct_mark_mask;
ctx->wc->masks.ct_label = old_ct_label_mask;
if (ofc->recirc_table != NX_CT_RECIRC_NONE) {
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
ctx->conntracked = true;
compose_recirculate_and_fork(ctx, ofc->recirc_table, zone);
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
}
ctx->ct_nat_action = NULL;
/* The ct_* fields are only available in the scope of the 'recirc_table'
* call chain. */
flow_clear_conntrack(&ctx->xin->flow);
xlate_report(ctx, OFT_DETAIL, "Sets the packet to an untracked state, "
"and clears all the conntrack fields.");
ctx->conntracked = false;
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
}
static void
compose_ct_clear_action(struct xlate_ctx *ctx)
{
clear_conntrack(ctx);
/* This action originally existed without dpif support. So to preserve
* compatibility, only append it if the dpif supports it. */
if (ctx->xbridge->support.ct_clear) {
nl_msg_put_flag(ctx->odp_actions, OVS_ACTION_ATTR_CT_CLEAR);
}
}
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
/* check_pkt_larger action checks the packet length and stores the
* result in the register bit. We translate this action to the
* datapath action - 'check_pkt_len' whose format
 * is: 'check_pkt_len(pkt_len, gt(actions), le(actions))'.
*
* We first set the destination register bit to 1 and call
* 'do_xlate_actions' for the case - packet len greater than
* the specified packet length.
*
* We then set the destination register bit to 0 and call
* 'do_xlate_actions' for the case - packet length is lesser or
* equal to the specified packet length.
*
* It is possible for freezing to happen for both the cases.
*/
static void
xlate_check_pkt_larger(struct xlate_ctx *ctx,
struct ofpact_check_pkt_larger *check_pkt_larger,
const struct ofpact *remaining_acts,
size_t remaining_acts_len)
{
union mf_subvalue *value = xmalloc(sizeof *value);
memset(value, 0, sizeof *value);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
if (!ctx->xbridge->support.check_pkt_len) {
uint8_t is_pkt_larger = 0;
if (ctx->xin->packet) {
is_pkt_larger =
dp_packet_size(ctx->xin->packet) > check_pkt_larger->pkt_len;
}
value->u8_val = is_pkt_larger;
mf_write_subfield_flow(&check_pkt_larger->dst, value,
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
&ctx->xin->flow);
/* If datapath doesn't support check_pkt_len action, then set the
* SLOW_ACTION flag. If we don't set SLOW_ACTION, we
* will push a flow to the datapath based on the packet length
 * in ctx->xin->packet. For subsequent packets which match the
* same flow, datapath will apply the actions without considering
* the packet length. This results in wrong actions being applied.
*/
ctx->xout->slow |= SLOW_ACTION;
free(value);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
return;
}
struct xretained_state *retained_state;
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
retained_state = xretain_state_save(ctx);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
xlate_commit_actions(ctx);
xretain_base_flow_save(ctx, retained_state);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
bool old_was_mpls = ctx->was_mpls;
bool old_conntracked = ctx->conntracked;
size_t offset = nl_msg_start_nested(ctx->odp_actions,
OVS_ACTION_ATTR_CHECK_PKT_LEN);
nl_msg_put_u16(ctx->odp_actions, OVS_CHECK_PKT_LEN_ATTR_PKT_LEN,
check_pkt_larger->pkt_len);
size_t offset_attr = nl_msg_start_nested(
ctx->odp_actions, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER);
value->u8_val = 1;
mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
do_xlate_actions(remaining_acts, remaining_acts_len, ctx, true, false);
if (!ctx->freezing) {
xlate_action_set(ctx);
}
if (ctx->freezing) {
finish_freezing(ctx);
}
nl_msg_end_nested(ctx->odp_actions, offset_attr);
xretain_base_flow_restore(ctx, retained_state);
xretain_flow_restore(ctx, retained_state);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
ctx->was_mpls = old_was_mpls;
ctx->conntracked = old_conntracked;
/* If the flow translation for the IF_GREATER case requires freezing,
* then ctx->exit would be true. Reset to false so that we can
* do flow translation for 'IF_LESS_EQUAL' case. finish_freezing()
* would have taken care of Undoing the changes done for freeze. */
bool old_exit = ctx->exit;
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
ctx->exit = false;
offset_attr = nl_msg_start_nested(
ctx->odp_actions, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL);
value->u8_val = 0;
mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
do_xlate_actions(remaining_acts, remaining_acts_len, ctx, true, false);
if (!ctx->freezing) {
xlate_action_set(ctx);
}
if (ctx->freezing) {
finish_freezing(ctx);
}
nl_msg_end_nested(ctx->odp_actions, offset_attr);
nl_msg_end_nested(ctx->odp_actions, offset);
ctx->was_mpls = old_was_mpls;
ctx->conntracked = old_conntracked;
ctx->exit = old_exit;
xretain_base_flow_restore(ctx, retained_state);
xretain_state_restore_and_free(ctx, retained_state);
free(value);
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
}
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
static void
rewrite_flow_encap_ethernet(struct xlate_ctx *ctx,
struct flow *flow,
struct flow_wildcards *wc)
{
wc->masks.packet_type = OVS_BE32_MAX;
if (pt_ns(flow->packet_type) == OFPHTN_ETHERTYPE) {
/* Only adjust the packet_type and zero the dummy Ethernet addresses. */
ovs_be16 ethertype = pt_ns_type_be(flow->packet_type);
flow->packet_type = htonl(PT_ETH);
flow->dl_src = eth_addr_zero;
flow->dl_dst = eth_addr_zero;
flow->dl_type = ethertype;
} else {
/* Error handling: drop packet. */
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
xlate_report_debug(ctx, OFT_ACTION,
"Dropping packet as encap(ethernet) is not "
"supported for packet type ethernet.");
ctx->error = XLATE_UNSUPPORTED_PACKET_TYPE;
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
}
}
2021-11-29 11:52:05 +05:30
/* Translates encap(mpls) / encap(mpls_mc) into a rewrite of 'flow':
 * pushes a fresh MPLS LSE whose ethertype comes from the encap action's
 * new packet type.  If the label stack is already at FLOW_MAX_MPLS_LABELS,
 * the packet is dropped via ctx->error instead. */
static void
rewrite_flow_encap_mpls(struct xlate_ctx *ctx,
                        const struct ofpact_encap *encap,
                        struct flow *flow,
                        struct flow_wildcards *wc)
{
    ovs_be16 mpls_ethertype = pt_ns_type_be(encap->new_pkt_type);
    int n_labels = flow_count_mpls_labels(flow, ctx->wc);

    if (n_labels >= FLOW_MAX_MPLS_LABELS) {
        if (ctx->xin->packet != NULL) {
            xlate_report_error(ctx, "dropping packet on which an encap MPLS "
                               "action can't be performed as it would have "
                               "more MPLS LSEs than the %d supported.",
                               FLOW_MAX_MPLS_LABELS);
        }
        ctx->error = XLATE_TOO_MANY_MPLS_LABELS;
        return;
    }

    /* The translation depends on the packet type, so match on it. */
    wc->masks.packet_type = OVS_BE32_MAX;

    /* If the current packet is already a MPLS packet with ethernet header
     * the existing MPLS states must be cleared before the encap MPLS action
     * is applied. */
    if (flow->packet_type == htonl(PT_ETH)
        && flow->dl_type == htons(ETH_TYPE_MPLS)) {
        memset(&ctx->wc->masks.mpls_lse, 0x0,
               sizeof *wc->masks.mpls_lse * FLOW_MAX_MPLS_LABELS);
        memset(&flow->mpls_lse, 0x0,
               sizeof *flow->mpls_lse * FLOW_MAX_MPLS_LABELS);
        memset(&ctx->base_flow.mpls_lse, 0x0,
               sizeof *ctx->base_flow.mpls_lse * FLOW_MAX_MPLS_LABELS);
    }

    flow->packet_type = encap->new_pkt_type;
    flow_push_mpls(flow, n_labels, mpls_ethertype, ctx->wc, true);
    /* The resulting bare-MPLS packet has no Ethernet header. */
    flow->dl_src = eth_addr_zero;
    flow->dl_dst = eth_addr_zero;
}
/* For an MD2 NSH header returns a pointer to an ofpbuf with the encoded
 * MD2 TLVs provided as encap properties to the encap operation. This
 * will be stored as encap_data in the ctx and copied into the push_nsh
 * action at the next commit.
 *
 * Rewrites 'flow' so that it describes the packet after an NSH header has
 * been pushed: packet_type becomes PT_NSH, dl_type ETH_TYPE_NSH, and the
 * nsh.* fields are initialized.  Returns NULL when there is no MD2 TLV
 * data to carry (including the MD type 1 case, which has no TLVs); on an
 * unsupported inner packet type, sets ctx->error and returns whatever
 * buffer state was reached.
 *
 * NOTE(review): ownership of the returned ofpbuf passes to the caller
 * (stored as ctx->encap_data per the comment above) — presumably the
 * caller also frees it on the ctx->error path; verify at the call site. */
static struct ofpbuf *
rewrite_flow_push_nsh(struct xlate_ctx *ctx,
                      const struct ofpact_encap *encap,
                      struct flow *flow,
                      struct flow_wildcards *wc)
{
    ovs_be32 packet_type = flow->packet_type;
    const char *ptr = (char *) encap->props;
    struct ofpbuf *buf = ofpbuf_new(NSH_CTX_HDRS_MAX_LEN);
    uint8_t md_type = NSH_M_TYPE1;   /* Default MD type unless a property
                                      * says otherwise. */
    uint8_t np = 0;                  /* NSH Next Protocol, derived below. */
    int i;

    /* Scan the optional NSH encap TLV properties, if any. */
    for (i = 0; i < encap->n_props; i++) {
        struct ofpact_ed_prop *prop_ptr =
            ALIGNED_CAST(struct ofpact_ed_prop *, ptr);
        if (prop_ptr->prop_class == OFPPPC_NSH) {
            switch (prop_ptr->type) {
                case OFPPPT_PROP_NSH_MDTYPE: {
                    /* Explicit MD type overrides the MD_TYPE1 default. */
                    struct ofpact_ed_prop_nsh_md_type *prop_md_type =
                        ALIGNED_CAST(struct ofpact_ed_prop_nsh_md_type *,
                                     prop_ptr);
                    md_type = prop_md_type->md_type;
                    break;
                }
                case OFPPPT_PROP_NSH_TLV: {
                    /* Re-encode the property as an on-the-wire MD2 TLV:
                     * header, payload, then zero padding to a 4-byte
                     * boundary. */
                    struct ofpact_ed_prop_nsh_tlv *tlv_prop =
                        ALIGNED_CAST(struct ofpact_ed_prop_nsh_tlv *,
                                     prop_ptr);
                    struct nsh_md2_tlv *md2_ctx =
                        ofpbuf_put_uninit(buf, sizeof(*md2_ctx));
                    md2_ctx->md_class = tlv_prop->tlv_class;
                    md2_ctx->type = tlv_prop->tlv_type;
                    md2_ctx->length = tlv_prop->tlv_len;
                    size_t len = ROUND_UP(md2_ctx->length, 4);
                    size_t padding = len - md2_ctx->length;
                    ofpbuf_put(buf, tlv_prop->data, md2_ctx->length);
                    ofpbuf_put_zeros(buf, padding);
                    break;
                }
                default:
                    /* No other NSH encap properties defined yet. */
                    break;
            }
        }
        /* Encap properties are 8-byte aligned in the ofpact list. */
        ptr += ROUND_UP(prop_ptr->len, 8);
    }
    /* No TLVs, or more TLV data than fits in an NSH context header:
     * either way there is nothing valid to hand to push_nsh. */
    if (buf->size == 0 || buf->size > NSH_CTX_HDRS_MAX_LEN) {
        ofpbuf_delete(buf);
        buf = NULL;
    }

    /* Determine the Next Protocol field for NSH header. */
    switch (ntohl(packet_type)) {
        case PT_ETH:
            np = NSH_P_ETHERNET;
            break;
        case PT_IPV4:
            np = NSH_P_IPV4;
            break;
        case PT_IPV6:
            np = NSH_P_IPV6;
            break;
        case PT_NSH:
            np = NSH_P_NSH;
            break;
        default:
            /* Error handling: drop packet. */
            xlate_report_debug(ctx, OFT_ACTION,
                               "Dropping packet as encap(nsh) is not "
                               "supported for packet type (%d,0x%x)",
                               pt_ns(packet_type), pt_ns_type(packet_type));
            ctx->error = XLATE_UNSUPPORTED_PACKET_TYPE;
            return buf;
    }
    /* Note that we have matched on packet_type! */
    wc->masks.packet_type = OVS_BE32_MAX;

    /* Reset all current flow packet headers. */
    memset(&flow->dl_dst, 0,
           sizeof(struct flow) - offsetof(struct flow, dl_dst));

    /* Populate the flow with the new NSH header. */
    flow->packet_type = htonl(PT_NSH);
    flow->dl_type = htons(ETH_TYPE_NSH);
    flow->nsh.flags = 0;
    flow->nsh.ttl = 63;
    flow->nsh.np = np;
    /* Service Path Index 0, Service Index 255 (the initial SI value). */
    flow->nsh.path_hdr = htonl(255);

    if (md_type == NSH_M_TYPE1) {
        flow->nsh.mdtype = NSH_M_TYPE1;
        memset(flow->nsh.context, 0, sizeof flow->nsh.context);
        if (buf) {
            /* Drop any MD2 context TLVs. */
            ofpbuf_delete(buf);
            buf = NULL;
        }
    } else if (md_type == NSH_M_TYPE2) {
        flow->nsh.mdtype = NSH_M_TYPE2;
    }
    /* Keep only the bits of mdtype that fit the NSH MD-type field. */
    flow->nsh.mdtype &= NSH_MDTYPE_MASK;

    return buf;
}
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
static void
xlate_generic_encap_action(struct xlate_ctx *ctx,
const struct ofpact_encap *encap)
{
struct flow *flow = &ctx->xin->flow;
struct flow_wildcards *wc = ctx->wc;
struct ofpbuf *encap_data = NULL;
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
/* Ensure that any pending actions on the inner packet are applied before
* rewriting the flow */
xlate_commit_actions(ctx);
/* Rewrite the flow to reflect the effect of pushing the new encap header. */
switch (ntohl(encap->new_pkt_type)) {
case PT_ETH:
rewrite_flow_encap_ethernet(ctx, flow, wc);
break;
case PT_NSH:
encap_data = rewrite_flow_push_nsh(ctx, encap, flow, wc);
break;
2021-11-29 11:52:05 +05:30
case PT_MPLS:
case PT_MPLS_MC:
rewrite_flow_encap_mpls(ctx, encap, flow, wc);
if (!ctx->xbridge->support.add_mpls) {
ctx->xout->slow |= SLOW_ACTION;
}
break;
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
default:
/* New packet type was checked during decoding. */
OVS_NOT_REACHED();
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
}
if (!ctx->error) {
/* The actual encap datapath action will be generated at next commit. */
ctx->pending_encap = true;
ctx->encap_data = encap_data;
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
}
}
/* Translates a generic decap action for the current outermost header
 * (Ethernet, NSH or MPLS).  Commits any pending actions first, then updates
 * ctx->xin->flow's packet_type; the datapath pop action itself is emitted at
 * a later commit (ctx->pending_decap).
 *
 * Returns true if the packet must be recirculated after decapsulation so
 * that the inner headers can be re-parsed, false otherwise.  On unsupported
 * input sets ctx->error and returns false. */
static bool
xlate_generic_decap_action(struct xlate_ctx *ctx,
                           const struct ofpact_decap *decap OVS_UNUSED)
{
    struct flow *flow = &ctx->xin->flow;

    /* Ensure that any pending actions on the current packet are applied
     * before generating the decap action. */
    xlate_commit_actions(ctx);

    /* We assume for now that the new_pkt_type is PT_USE_NEXT_PROTO. */
    switch (ntohl(flow->packet_type)) {
    case PT_ETH:
        if (flow->vlans[0].tci & htons(VLAN_CFI)) {
            /* Error handling: drop packet. */
            xlate_report_debug(ctx, OFT_ACTION, "Dropping packet, cannot "
                               "decap Ethernet if VLAN is present.");
            ctx->error = XLATE_UNSUPPORTED_PACKET_TYPE;
        } else {
            /* Just change the packet_type.
             * Delay generating pop_eth to the next commit. */
            flow->packet_type = htonl(PACKET_TYPE(OFPHTN_ETHERTYPE,
                                                  ntohs(flow->dl_type)));
            flow->dl_src = eth_addr_zero;
            flow->dl_dst = eth_addr_zero;
            ctx->wc->masks.dl_type = OVS_BE16_MAX;
        }
        return false;
    case PT_NSH:
        /* The pop_nsh action is generated at the commit executed as
         * part of freezing the ctx for recirculation. Here we just set
         * the new packet type based on the NSH next protocol field. */
        switch (flow->nsh.np) {
        case NSH_P_ETHERNET:
            flow->packet_type = htonl(PT_ETH);
            break;
        case NSH_P_IPV4:
            flow->packet_type = htonl(PT_IPV4);
            break;
        case NSH_P_IPV6:
            flow->packet_type = htonl(PT_IPV6);
            break;
        case NSH_P_NSH:
            flow->packet_type = htonl(PT_NSH);
            break;
        default:
            /* Error handling: drop packet. */
            xlate_report_debug(ctx, OFT_ACTION,
                               "Dropping packet as NSH next protocol %d "
                               "is not supported", flow->nsh.np);
            ctx->error = XLATE_UNSUPPORTED_PACKET_TYPE;
            return false;
        }
        ctx->wc->masks.nsh.np = UINT8_MAX;
        ctx->pending_decap = true;
        /* Trigger recirculation. */
        return true;
    case PT_MPLS:
    case PT_MPLS_MC: {
        int n;
        ovs_be16 ethertype;

        flow->packet_type = decap->new_pkt_type;
        ethertype = pt_ns_type_be(flow->packet_type);

        n = flow_count_mpls_labels(flow, ctx->wc);
        if (!ethertype) {
            /* No explicit new packet type: treat inner as transparent
             * Ethernet bridging. */
            ethertype = htons(ETH_TYPE_TEB);
        }
        if (flow_pop_mpls(flow, n, ethertype, ctx->wc)) {
            if (!ctx->xbridge->support.add_mpls) {
                ctx->xout->slow |= SLOW_ACTION;
            }
            ctx->pending_decap = true;
            if (n == 1) {
                /* Last label popped: recirculate to re-parse the inner
                 * packet. */
                return true;
            } else {
                return false;
            }
        } else if (n >= FLOW_MAX_MPLS_LABELS) {
            if (ctx->xin->packet != NULL) {
                xlate_report_error(ctx, "dropping packet on which an "
                                   "MPLS decap can't be performed as "
                                   "it has more MPLS LSEs than the %d "
                                   "supported.",
                                   FLOW_MAX_MPLS_LABELS);
            }
            ctx->error = XLATE_TOO_MANY_MPLS_LABELS;
            ofpbuf_clear(ctx->odp_actions);
            return false;
        } else {
            return false;
        }
    }
    default:
        /* Error handling: drop packet. */
        xlate_report_debug(
            ctx, OFT_ACTION,
            "Dropping packet as the decap() does not support "
            "packet type (%d,0x%x)",
            pt_ns(flow->packet_type), pt_ns_type(flow->packet_type));
        ctx->error = XLATE_UNSUPPORTED_PACKET_TYPE;
        return false;
    }
}
/* Decides whether executing action 'a' requires recirculation because the
 * packet was MPLS earlier in the pipeline (ctx->was_mpls) and has since had
 * its labels popped: until the packet is re-parsed, L3+ fields are not
 * reliably known.  Actions known to be safe are whitelisted to avoid the
 * cost of recirculation in common use-cases; all other actions trigger a
 * freeze/recirculate. */
static void
recirc_for_mpls(const struct ofpact *a, struct xlate_ctx *ctx)
{
    /* No need to recirculate if already exiting. */
    if (ctx->exit) {
        return;
    }

    /* Do not consider recirculating unless the packet was previously MPLS. */
    if (!ctx->was_mpls) {
        return;
    }

    /* Special case these actions, only recirculating if necessary.
     * This avoids the overhead of recirculation in common use-cases.
     */
    switch (a->type) {

    /* Output actions do not require recirculation. */
    case OFPACT_OUTPUT:
    case OFPACT_OUTPUT_TRUNC:
    case OFPACT_ENQUEUE:
    case OFPACT_OUTPUT_REG:
    /* Set actions that don't touch L3+ fields do not require recirculation. */
    case OFPACT_SET_VLAN_VID:
    case OFPACT_SET_VLAN_PCP:
    case OFPACT_SET_ETH_SRC:
    case OFPACT_SET_ETH_DST:
    case OFPACT_SET_TUNNEL:
    case OFPACT_SET_QUEUE:
    /* If actions of a group require recirculation that can be detected
     * when translating them. */
    case OFPACT_GROUP:
        return;

    /* Set field that don't touch L3+ fields don't require recirculation. */
    case OFPACT_SET_FIELD:
        if (mf_is_l3_or_higher(ofpact_get_SET_FIELD(a)->field)) {
            break;
        }
        return;

    /* For simplicity, recirculate in all other cases. */
    case OFPACT_CONTROLLER:
    case OFPACT_BUNDLE:
    case OFPACT_STRIP_VLAN:
    case OFPACT_PUSH_VLAN:
    case OFPACT_SET_IPV4_SRC:
    case OFPACT_SET_IPV4_DST:
    case OFPACT_SET_IP_DSCP:
    case OFPACT_SET_IP_ECN:
    case OFPACT_SET_IP_TTL:
    case OFPACT_SET_L4_SRC_PORT:
    case OFPACT_SET_L4_DST_PORT:
    case OFPACT_REG_MOVE:
    case OFPACT_STACK_PUSH:
    case OFPACT_STACK_POP:
    case OFPACT_DEC_TTL:
    case OFPACT_SET_MPLS_LABEL:
    case OFPACT_SET_MPLS_TC:
    case OFPACT_SET_MPLS_TTL:
    case OFPACT_DEC_MPLS_TTL:
    case OFPACT_PUSH_MPLS:
    case OFPACT_POP_MPLS:
    case OFPACT_POP_QUEUE:
    case OFPACT_FIN_TIMEOUT:
    case OFPACT_RESUBMIT:
    case OFPACT_LEARN:
    case OFPACT_CONJUNCTION:
    case OFPACT_MULTIPATH:
    case OFPACT_NOTE:
    case OFPACT_EXIT:
    case OFPACT_SAMPLE:
    case OFPACT_CLONE:
    case OFPACT_ENCAP:
    case OFPACT_DECAP:
    case OFPACT_DEC_NSH_TTL:
    case OFPACT_UNROLL_XLATE:
    case OFPACT_CT:
    case OFPACT_CT_CLEAR:
    case OFPACT_NAT:
    case OFPACT_DEBUG_RECIRC:
    case OFPACT_DEBUG_SLOW:
    case OFPACT_METER:
    case OFPACT_CLEAR_ACTIONS:
    case OFPACT_WRITE_ACTIONS:
    case OFPACT_WRITE_METADATA:
    case OFPACT_GOTO_TABLE:
    case OFPACT_CHECK_PKT_LARGER:
    case OFPACT_DELETE_FIELD:
    default:
        break;
    }

    /* Recirculate */
    ctx_trigger_freeze(ctx);
}
/* Executes a register-move action: copies subfield 'a->src' to subfield
 * 'a->dst' within the flow being translated, unwildcarding both subfields
 * via ctx->wc, then reports the destination's new value to the trace. */
static void
xlate_ofpact_reg_move(struct xlate_ctx *ctx, const struct ofpact_reg_move *a)
{
    struct flow *flow = &ctx->xin->flow;

    mf_subfield_copy(&a->src, &a->dst, flow, ctx->wc);
    xlate_report_subfield(ctx, &a->dst);
}
static void
xlate_ofpact_stack_pop(struct xlate_ctx *ctx, const struct ofpact_stack *a)
{
if (nxm_execute_stack_pop(a, &ctx->xin->flow, ctx->wc, &ctx->stack)) {
xlate_report_subfield(ctx, &a->subfield);
} else {
xlate_report_error(ctx, "stack underflow");
}
}
/* Restore translation context data that was stored earlier.
 *
 * UNROLL_XLATE actions are inserted into a frozen action list when
 * translation is interrupted (e.g. for recirculation or a continuation);
 * on thaw, this puts back the table id and rule cookie that were current
 * when the state was saved, so subsequent reporting and lookups see the
 * original pipeline position. */
static void
xlate_ofpact_unroll_xlate(struct xlate_ctx *ctx,
                          const struct ofpact_unroll_xlate *a)
{
    ctx->table_id = a->rule_table_id;
    ctx->rule_cookie = a->rule_cookie;
    /* OFT_THAW marks this trace line as part of resuming frozen state. */
    xlate_report(ctx, OFT_THAW, "restored state: table=%"PRIu8", "
                 "cookie=%#"PRIx64, a->rule_table_id, a->rule_cookie);
}
/* Reset the mirror context if we modify the packet and would like to mirror
 * the new copy.
 *
 * ctx->mirrors records which mirrors have already received this packet, so
 * that a packet is not mirrored twice.  Clearing it here means the (about to
 * be) modified packet is treated as a new copy and may be mirrored again.
 * The switch must enumerate every OFPACT_* value: a newly added action type
 * that is not listed will hit OVS_NOT_REACHED() and abort, forcing the
 * author to decide whether it modifies the packet. */
static void
reset_mirror_ctx(struct xlate_ctx *ctx, const struct flow *flow,
                 const struct ofpact *a)
{
    switch (a->type) {
    /* Actions that unconditionally modify the packet. */
    case OFPACT_STRIP_VLAN:
    case OFPACT_PUSH_VLAN:
    case OFPACT_SET_ETH_SRC:
    case OFPACT_SET_ETH_DST:
    case OFPACT_PUSH_MPLS:
    case OFPACT_POP_MPLS:
    case OFPACT_SET_MPLS_LABEL:
    case OFPACT_SET_MPLS_TC:
    case OFPACT_SET_MPLS_TTL:
    case OFPACT_DEC_MPLS_TTL:
    case OFPACT_DEC_NSH_TTL:
    case OFPACT_DEC_TTL:
    case OFPACT_SET_VLAN_VID:
    case OFPACT_SET_VLAN_PCP:
    case OFPACT_ENCAP:
    case OFPACT_DECAP:
    case OFPACT_NAT:
        ctx->mirrors = 0;
        return;

    /* set_field modifies the packet only if its prerequisites (e.g. the
     * right Ethertype or IP protocol) hold for this flow. */
    case OFPACT_SET_FIELD: {
        const struct ofpact_set_field *set_field;
        const struct mf_field *mf;

        set_field = ofpact_get_SET_FIELD(a);
        mf = set_field->field;
        if (mf_are_prereqs_ok(mf, flow, NULL)) {
            ctx->mirrors = 0;
        }
        return;
    }

    /* IPv4 address rewrites only take effect on IPv4 packets. */
    case OFPACT_SET_IPV4_SRC:
    case OFPACT_SET_IPV4_DST:
        if (flow->dl_type == htons(ETH_TYPE_IP)) {
            ctx->mirrors = 0;
        }
        return;

    /* IP header rewrites take effect on IPv4 or IPv6 packets. */
    case OFPACT_SET_IP_DSCP:
    case OFPACT_SET_IP_ECN:
    case OFPACT_SET_IP_TTL:
        if (is_ip_any(flow)) {
            ctx->mirrors = 0;
        }
        return;

    /* L4 port rewrites require an IP packet with a transport header;
     * non-first fragments (FLOW_NW_FRAG_LATER) have none. */
    case OFPACT_SET_L4_SRC_PORT:
    case OFPACT_SET_L4_DST_PORT:
        if (is_ip_any(flow) && !(flow->nw_frag & FLOW_NW_FRAG_LATER)) {
            ctx->mirrors = 0;
        }
        return;

    /* Actions that never modify the packet: mirror state is kept. */
    case OFPACT_OUTPUT_REG:
    case OFPACT_OUTPUT_TRUNC:
    case OFPACT_GROUP:
    case OFPACT_OUTPUT:
    case OFPACT_CONTROLLER:
    case OFPACT_RESUBMIT:
    case OFPACT_GOTO_TABLE:
    case OFPACT_WRITE_METADATA:
    case OFPACT_SET_TUNNEL:
    case OFPACT_REG_MOVE:
    case OFPACT_STACK_PUSH:
    case OFPACT_STACK_POP:
    case OFPACT_LEARN:
    case OFPACT_ENQUEUE:
    case OFPACT_SET_QUEUE:
    case OFPACT_POP_QUEUE:
    case OFPACT_MULTIPATH:
    case OFPACT_BUNDLE:
    case OFPACT_EXIT:
    case OFPACT_UNROLL_XLATE:
    case OFPACT_FIN_TIMEOUT:
    case OFPACT_CLEAR_ACTIONS:
    case OFPACT_WRITE_ACTIONS:
    case OFPACT_METER:
    case OFPACT_SAMPLE:
    case OFPACT_CLONE:
    case OFPACT_DEBUG_RECIRC:
    case OFPACT_DEBUG_SLOW:
    case OFPACT_CT:
    case OFPACT_CT_CLEAR:
    case OFPACT_CHECK_PKT_LARGER:
    case OFPACT_DELETE_FIELD:
    case OFPACT_NOTE:
    case OFPACT_CONJUNCTION:
        return;
    }
    OVS_NOT_REACHED();
}
static void
xlate_trace(struct xlate_ctx *ctx, const struct ofpact *a)
{
struct ofputil_port_map *map;
map = xmalloc(sizeof *map);
ofputil_port_map_init(map);
if (ctx->xin->names) {
struct ofproto_dpif *ofprotop;
ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name);
ofproto_append_ports_to_map(map, ofprotop->up.ports);
}
struct ds s = DS_EMPTY_INITIALIZER;
struct ofpact_format_params fp = { .s = &s, .port_map = map };
ofpacts_format(a, OFPACT_ALIGN(a->len), &fp);
xlate_report(ctx, OFT_ACTION, "%s", ds_cstr(&s));
ds_destroy(&s);
ofputil_port_map_destroy(map);
free(map);
}
static void
do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
struct xlate_ctx *ctx, bool is_last_action,
bool group_bucket_action)
{
struct flow_wildcards *wc = ctx->wc;
struct flow *flow = &ctx->xin->flow;
const struct ofpact *a;
/* dl_type already in the mask, not set below. */
if (!ofpacts_len) {
xlate_report(ctx, OFT_ACTION, "drop");
return;
}
bool exit = false;
OFPACT_FOR_EACH (a, ofpacts, ofpacts_len) {
struct ofpact_controller *controller;
const struct ofpact_metadata *metadata;
const struct ofpact_set_field *set_field;
const struct mf_field *mf;
bool last = is_last_action && ofpact_last(a, ofpacts, ofpacts_len)
&& !ctx->action_set.size;
if (ctx->error) {
break;
}
recirc_for_mpls(a, ctx);
if (ctx->exit || exit) {
/* Check if need to store the remaining actions for later
* execution. */
if (ctx->freezing) {
freeze_unroll_actions(a, ofpact_end(ofpacts, ofpacts_len),
ctx);
}
break;
}
reset_mirror_ctx(ctx, flow, a);
if (OVS_UNLIKELY(ctx->xin->trace)) {
xlate_trace(ctx, a);
}
switch (a->type) {
case OFPACT_OUTPUT:
xlate_output_action(ctx, ofpact_get_OUTPUT(a)->port,
ofpact_get_OUTPUT(a)->max_len, true, last,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
false, group_bucket_action);
break;
case OFPACT_GROUP:
if (xlate_group_action(ctx, ofpact_get_GROUP(a)->group_id, last)) {
/* Group could not be found. */
/* XXX: Terminates action list translation, but does not
* terminate the pipeline. */
return;
}
break;
case OFPACT_CONTROLLER:
controller = ofpact_get_CONTROLLER(a);
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
if (controller->pause) {
ctx->pause = controller;
ctx_trigger_freeze(ctx);
a = ofpact_next(a);
} else {
xlate_controller_action(ctx, controller->max_len,
controller->reason,
controller->controller_id,
controller->provider_meter_id,
controller->userdata,
controller->userdata_len);
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
}
break;
case OFPACT_ENQUEUE:
memset(&wc->masks.skb_priority, 0xff,
sizeof wc->masks.skb_priority);
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
xlate_enqueue_action(ctx, ofpact_get_ENQUEUE(a), last,
group_bucket_action);
break;
case OFPACT_SET_VLAN_VID:
wc->masks.vlans[0].tci |= htons(VLAN_VID_MASK | VLAN_CFI);
if (flow->vlans[0].tci & htons(VLAN_CFI) ||
ofpact_get_SET_VLAN_VID(a)->push_vlan_if_needed) {
if (!flow->vlans[0].tpid) {
flow->vlans[0].tpid = htons(ETH_TYPE_VLAN);
}
flow->vlans[0].tci &= ~htons(VLAN_VID_MASK);
flow->vlans[0].tci |=
(htons(ofpact_get_SET_VLAN_VID(a)->vlan_vid) |
htons(VLAN_CFI));
}
break;
case OFPACT_SET_VLAN_PCP:
wc->masks.vlans[0].tci |= htons(VLAN_PCP_MASK | VLAN_CFI);
if (flow->vlans[0].tci & htons(VLAN_CFI) ||
ofpact_get_SET_VLAN_PCP(a)->push_vlan_if_needed) {
if (!flow->vlans[0].tpid) {
flow->vlans[0].tpid = htons(ETH_TYPE_VLAN);
}
flow->vlans[0].tci &= ~htons(VLAN_PCP_MASK);
flow->vlans[0].tci |=
htons((ofpact_get_SET_VLAN_PCP(a)->vlan_pcp
<< VLAN_PCP_SHIFT) | VLAN_CFI);
}
break;
case OFPACT_STRIP_VLAN:
flow_pop_vlan(flow, wc);
break;
case OFPACT_PUSH_VLAN:
flow_push_vlan_uninit(flow, wc);
flow->vlans[0].tpid = ofpact_get_PUSH_VLAN(a)->ethertype;
flow->vlans[0].tci = htons(VLAN_CFI);
break;
case OFPACT_SET_ETH_SRC:
WC_MASK_FIELD(wc, dl_src);
flow->dl_src = ofpact_get_SET_ETH_SRC(a)->mac;
break;
case OFPACT_SET_ETH_DST:
WC_MASK_FIELD(wc, dl_dst);
flow->dl_dst = ofpact_get_SET_ETH_DST(a)->mac;
break;
case OFPACT_SET_IPV4_SRC:
if (flow->dl_type == htons(ETH_TYPE_IP)) {
memset(&wc->masks.nw_src, 0xff, sizeof wc->masks.nw_src);
WC_MASK_FIELD(wc, nw_proto);
flow->nw_src = ofpact_get_SET_IPV4_SRC(a)->ipv4;
}
break;
case OFPACT_SET_IPV4_DST:
if (flow->dl_type == htons(ETH_TYPE_IP)) {
memset(&wc->masks.nw_dst, 0xff, sizeof wc->masks.nw_dst);
WC_MASK_FIELD(wc, nw_proto);
flow->nw_dst = ofpact_get_SET_IPV4_DST(a)->ipv4;
}
break;
case OFPACT_SET_IP_DSCP:
if (is_ip_any(flow)) {
WC_MASK_FIELD(wc, nw_proto);
wc->masks.nw_tos |= IP_DSCP_MASK;
flow->nw_tos &= ~IP_DSCP_MASK;
flow->nw_tos |= ofpact_get_SET_IP_DSCP(a)->dscp;
}
break;
case OFPACT_SET_IP_ECN:
if (is_ip_any(flow)) {
WC_MASK_FIELD(wc, nw_proto);
wc->masks.nw_tos |= IP_ECN_MASK;
flow->nw_tos &= ~IP_ECN_MASK;
flow->nw_tos |= ofpact_get_SET_IP_ECN(a)->ecn;
}
break;
case OFPACT_SET_IP_TTL:
if (is_ip_any(flow)) {
WC_MASK_FIELD(wc, nw_proto);
wc->masks.nw_ttl = 0xff;
flow->nw_ttl = ofpact_get_SET_IP_TTL(a)->ttl;
}
break;
case OFPACT_SET_L4_SRC_PORT:
Fix setting transport ports with frags. Packets with 'LATER' fragment do not have a transport header, so it is not possible to either match on or set transport ports on such packets. Matching is prevented by augmenting mf_are_prereqs_ok() with a nw_frag 'LATER' bit check. Setting the transport headers on such packets is prevented in three ways: 1. Flows with an explicit match on nw_frag, where the LATER bit is 1: existing calls to the modified mf_are_prereqs_ok() prohibit using transport header fields (port numbers) in OXM/NXM actions (set_field, move). SET_TP_* actions need a new check on the LATER bit. 2. Flows that wildcard the nw_frag LATER bit: At flow translation time, add calls to mf_are_prereqs_ok() to make sure that we do not use transport ports in flows that do not have them. 3. At action execution time, do not set transport ports, if the packet does not have a full transport header. This ensures that we never call the packet_set functions, that require a valid transport header, with packets that do not have them. For example, if the flow was created with a IPv6 first fragment that had the full TCP header, but the next packet's first fragment is missing them. 3 alone would suffice for correct behavior, but 1 and 2 seem like a right thing to do, anyway. Currently, if we are setting port numbers, we will also match them, due to us tracking the set fields with the same flow_wildcards as the matched fields. Hence, if the incoming port number was not zero, the flow would not match any packets with missing or truncated transport headers. However, relying on no packets having zero port numbers would not be very robust. Also, we may separate the tracking of set and matched fields in the future, which would allow some flows that blindly set port numbers to not match on them at all. For TCP in case 3 we use ofpbuf_get_tcp_payload() that requires the whole (potentially variable size) TCP header to be present. 
However, when parsing a flow, we only require the fixed size portion of the TCP header to be present, which would be enough to set the port numbers and fix the TCP checksum. Finally, we add tests testing the new behavior. Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-11-05 10:10:13 -08:00
if (is_ip_any(flow) && !(flow->nw_frag & FLOW_NW_FRAG_LATER)) {
memset(&wc->masks.nw_proto, 0xff, sizeof wc->masks.nw_proto);
memset(&wc->masks.tp_src, 0xff, sizeof wc->masks.tp_src);
flow->tp_src = htons(ofpact_get_SET_L4_SRC_PORT(a)->port);
}
break;
case OFPACT_SET_L4_DST_PORT:
Fix setting transport ports with frags. Packets with 'LATER' fragment do not have a transport header, so it is not possible to either match on or set transport ports on such packets. Matching is prevented by augmenting mf_are_prereqs_ok() with a nw_frag 'LATER' bit check. Setting the transport headers on such packets is prevented in three ways: 1. Flows with an explicit match on nw_frag, where the LATER bit is 1: existing calls to the modified mf_are_prereqs_ok() prohibit using transport header fields (port numbers) in OXM/NXM actions (set_field, move). SET_TP_* actions need a new check on the LATER bit. 2. Flows that wildcard the nw_frag LATER bit: At flow translation time, add calls to mf_are_prereqs_ok() to make sure that we do not use transport ports in flows that do not have them. 3. At action execution time, do not set transport ports, if the packet does not have a full transport header. This ensures that we never call the packet_set functions, that require a valid transport header, with packets that do not have them. For example, if the flow was created with a IPv6 first fragment that had the full TCP header, but the next packet's first fragment is missing them. 3 alone would suffice for correct behavior, but 1 and 2 seem like a right thing to do, anyway. Currently, if we are setting port numbers, we will also match them, due to us tracking the set fields with the same flow_wildcards as the matched fields. Hence, if the incoming port number was not zero, the flow would not match any packets with missing or truncated transport headers. However, relying on no packets having zero port numbers would not be very robust. Also, we may separate the tracking of set and matched fields in the future, which would allow some flows that blindly set port numbers to not match on them at all. For TCP in case 3 we use ofpbuf_get_tcp_payload() that requires the whole (potentially variable size) TCP header to be present. 
However, when parsing a flow, we only require the fixed size portion of the TCP header to be present, which would be enough to set the port numbers and fix the TCP checksum. Finally, we add tests testing the new behavior. Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-11-05 10:10:13 -08:00
if (is_ip_any(flow) && !(flow->nw_frag & FLOW_NW_FRAG_LATER)) {
memset(&wc->masks.nw_proto, 0xff, sizeof wc->masks.nw_proto);
memset(&wc->masks.tp_dst, 0xff, sizeof wc->masks.tp_dst);
flow->tp_dst = htons(ofpact_get_SET_L4_DST_PORT(a)->port);
}
break;
ofproto-dpif-xlate: Fix continuations with OF instructions in OF1.1+. Open vSwitch supports OpenFlow "instructions", which were introduced in OpenFlow 1.1 and act like restricted kinds of actions that can only appear in a particular order and particular circumstances. OVS did not support two of these instructions, "write_metadata" and "goto_table", properly in the case where they appeared in a flow that needed to be frozen for continuations. Both of these instructions had the problem that they couldn't be properly serialized into the stream of actions, because they're not actions. This commit fixes that problem in freeze_unroll_actions() by converting them into equivalent actions for serialization. goto_table had the additional problem that it was being serialized to the frozen stream even after it had been executed. This was already properly handled in do_xlate_actions() for resubmit, which is almost equivalent to goto_table, so this commit applies the same fix to goto_table. (The commit removes an assertion from the goto_table implementation, but there wasn't any real value in that assertion and I thought the code looked cleaner without it.) This commit adds tests that would have found these bugs. This includes adding a variant of each continuation test that uses OF1.3 for monitor/resume (which is necessary to trigger these bugs) plus specific tests for continuations with goto_table and write_metadata. It also improves the continuation test infrastructure to add more detail on the problem if a test fails. Signed-off-by: Ben Pfaff <blp@ovn.org> Reported-by: Grayson Wu <wgrayson@vmware.com> Reported-at: https://github.com/openvswitch/ovs-issues/issues/213 Discussed-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/386166.html Acked-by: Ilya Maximets <i.maximets@ovn.org>
2021-07-07 11:51:50 -07:00
/* Freezing complicates resubmit and goto_table. Some action in the
* flow entry found by resubmit might trigger freezing. If that
* happens, then we do not want to execute the resubmit or goto_table
* again after during thawing, so we want to skip back to the head of
* the loop to avoid that, only adding any actions that follow the
* resubmit to the frozen actions.
*/
case OFPACT_RESUBMIT:
xlate_ofpact_resubmit(ctx, ofpact_get_RESUBMIT(a), last);
continue;
ofproto-dpif-xlate: Fix continuations with OF instructions in OF1.1+. Open vSwitch supports OpenFlow "instructions", which were introduced in OpenFlow 1.1 and act like restricted kinds of actions that can only appear in a particular order and particular circumstances. OVS did not support two of these instructions, "write_metadata" and "goto_table", properly in the case where they appeared in a flow that needed to be frozen for continuations. Both of these instructions had the problem that they couldn't be properly serialized into the stream of actions, because they're not actions. This commit fixes that problem in freeze_unroll_actions() by converting them into equivalent actions for serialization. goto_table had the additional problem that it was being serialized to the frozen stream even after it had been executed. This was already properly handled in do_xlate_actions() for resubmit, which is almost equivalent to goto_table, so this commit applies the same fix to goto_table. (The commit removes an assertion from the goto_table implementation, but there wasn't any real value in that assertion and I thought the code looked cleaner without it.) This commit adds tests that would have found these bugs. This includes adding a variant of each continuation test that uses OF1.3 for monitor/resume (which is necessary to trigger these bugs) plus specific tests for continuations with goto_table and write_metadata. It also improves the continuation test infrastructure to add more detail on the problem if a test fails. Signed-off-by: Ben Pfaff <blp@ovn.org> Reported-by: Grayson Wu <wgrayson@vmware.com> Reported-at: https://github.com/openvswitch/ovs-issues/issues/213 Discussed-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/386166.html Acked-by: Ilya Maximets <i.maximets@ovn.org>
2021-07-07 11:51:50 -07:00
case OFPACT_GOTO_TABLE:
xlate_table_action(ctx, ctx->xin->flow.in_port.ofp_port,
ofpact_get_GOTO_TABLE(a)->table_id,
true, true, false, last, do_xlate_actions);
continue;
case OFPACT_SET_TUNNEL:
flow->tunnel.tun_id = htonll(ofpact_get_SET_TUNNEL(a)->tun_id);
break;
case OFPACT_SET_QUEUE:
memset(&wc->masks.skb_priority, 0xff,
sizeof wc->masks.skb_priority);
xlate_set_queue_action(ctx, ofpact_get_SET_QUEUE(a)->queue_id);
break;
case OFPACT_POP_QUEUE:
memset(&wc->masks.skb_priority, 0xff,
sizeof wc->masks.skb_priority);
if (flow->skb_priority != ctx->orig_skb_priority) {
flow->skb_priority = ctx->orig_skb_priority;
xlate_report(ctx, OFT_DETAIL, "queue = %#"PRIx32,
flow->skb_priority);
}
break;
case OFPACT_REG_MOVE:
xlate_ofpact_reg_move(ctx, ofpact_get_REG_MOVE(a));
break;
case OFPACT_SET_FIELD:
set_field = ofpact_get_SET_FIELD(a);
mf = set_field->field;
/* Set the field only if the packet actually has it. */
if (mf_are_prereqs_ok(mf, flow, wc)) {
mf_set_mask_l3_prereqs(mf, flow, wc);
mf_mask_field_masked(mf, ofpact_set_field_mask(set_field), wc);
mf_set_flow_value_masked(mf, set_field->value,
ofpact_set_field_mask(set_field),
flow);
} else {
xlate_report(ctx, OFT_WARN,
"unmet prerequisites for %s, set_field ignored",
mf->name);
Fix setting transport ports with frags. Packets with 'LATER' fragment do not have a transport header, so it is not possible to either match on or set transport ports on such packets. Matching is prevented by augmenting mf_are_prereqs_ok() with a nw_frag 'LATER' bit check. Setting the transport headers on such packets is prevented in three ways: 1. Flows with an explicit match on nw_frag, where the LATER bit is 1: existing calls to the modified mf_are_prereqs_ok() prohibit using transport header fields (port numbers) in OXM/NXM actions (set_field, move). SET_TP_* actions need a new check on the LATER bit. 2. Flows that wildcard the nw_frag LATER bit: At flow translation time, add calls to mf_are_prereqs_ok() to make sure that we do not use transport ports in flows that do not have them. 3. At action execution time, do not set transport ports, if the packet does not have a full transport header. This ensures that we never call the packet_set functions, that require a valid transport header, with packets that do not have them. For example, if the flow was created with a IPv6 first fragment that had the full TCP header, but the next packet's first fragment is missing them. 3 alone would suffice for correct behavior, but 1 and 2 seem like a right thing to do, anyway. Currently, if we are setting port numbers, we will also match them, due to us tracking the set fields with the same flow_wildcards as the matched fields. Hence, if the incoming port number was not zero, the flow would not match any packets with missing or truncated transport headers. However, relying on no packets having zero port numbers would not be very robust. Also, we may separate the tracking of set and matched fields in the future, which would allow some flows that blindly set port numbers to not match on them at all. For TCP in case 3 we use ofpbuf_get_tcp_payload() that requires the whole (potentially variable size) TCP header to be present. 
However, when parsing a flow, we only require the fixed size portion of the TCP header to be present, which would be enough to set the port numbers and fix the TCP checksum. Finally, we add tests testing the new behavior. Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2014-11-05 10:10:13 -08:00
}
break;
case OFPACT_STACK_PUSH:
nxm_execute_stack_push(ofpact_get_STACK_PUSH(a), flow, wc,
&ctx->stack);
break;
case OFPACT_STACK_POP:
xlate_ofpact_stack_pop(ctx, ofpact_get_STACK_POP(a));
break;
case OFPACT_PUSH_MPLS:
compose_mpls_push_action(ctx, ofpact_get_PUSH_MPLS(a));
break;
case OFPACT_POP_MPLS:
compose_mpls_pop_action(ctx, ofpact_get_POP_MPLS(a)->ethertype);
break;
case OFPACT_SET_MPLS_LABEL:
compose_set_mpls_label_action(
ctx, ofpact_get_SET_MPLS_LABEL(a)->label);
break;
case OFPACT_SET_MPLS_TC:
compose_set_mpls_tc_action(ctx, ofpact_get_SET_MPLS_TC(a)->tc);
break;
case OFPACT_SET_MPLS_TTL:
compose_set_mpls_ttl_action(ctx, ofpact_get_SET_MPLS_TTL(a)->ttl);
break;
case OFPACT_DEC_MPLS_TTL:
if (compose_dec_mpls_ttl_action(ctx)) {
return;
}
break;
case OFPACT_DEC_NSH_TTL:
if (compose_dec_nsh_ttl_action(ctx)) {
return;
}
break;
case OFPACT_DEC_TTL:
wc->masks.nw_ttl = 0xff;
WC_MASK_FIELD(wc, nw_proto);
if (compose_dec_ttl(ctx, ofpact_get_DEC_TTL(a))) {
return;
}
break;
case OFPACT_NOTE:
/* Nothing to do. */
break;
case OFPACT_MULTIPATH:
multipath_execute(ofpact_get_MULTIPATH(a), flow, wc);
xlate_report_subfield(ctx, &ofpact_get_MULTIPATH(a)->dst);
break;
case OFPACT_BUNDLE:
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
xlate_bundle_action(ctx, ofpact_get_BUNDLE(a), last,
group_bucket_action);
break;
case OFPACT_OUTPUT_REG:
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
xlate_output_reg_action(ctx, ofpact_get_OUTPUT_REG(a), last,
group_bucket_action);
break;
case OFPACT_OUTPUT_TRUNC:
xlate_output_trunc_action(ctx, ofpact_get_OUTPUT_TRUNC(a)->port,
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
ofpact_get_OUTPUT_TRUNC(a)->max_len, last,
group_bucket_action);
break;
case OFPACT_LEARN:
xlate_learn_action(ctx, ofpact_get_LEARN(a));
break;
case OFPACT_CONJUNCTION:
/* A flow with a "conjunction" action represents part of a special
* kind of "set membership match". Such a flow should not actually
* get executed, but it could via, say, a "packet-out", even though
* that wouldn't be useful. Log it to help debugging. */
xlate_report_error(ctx, "executing no-op conjunction action");
break;
case OFPACT_EXIT:
ctx->exit = true;
break;
case OFPACT_UNROLL_XLATE:
xlate_ofpact_unroll_xlate(ctx, ofpact_get_UNROLL_XLATE(a));
break;
case OFPACT_FIN_TIMEOUT:
memset(&wc->masks.nw_proto, 0xff, sizeof wc->masks.nw_proto);
xlate_fin_timeout(ctx, ofpact_get_FIN_TIMEOUT(a));
break;
case OFPACT_DELETE_FIELD:
xlate_delete_field(ctx, flow, ofpact_get_DELETE_FIELD(a));
break;
case OFPACT_CLEAR_ACTIONS:
xlate_report_action_set(ctx, "was");
ofpbuf_clear(&ctx->action_set);
ctx->xin->flow.actset_output = OFPP_UNSET;
ctx->action_set_has_group = false;
break;
case OFPACT_WRITE_ACTIONS:
xlate_write_actions(ctx, ofpact_get_WRITE_ACTIONS(a));
xlate_report_action_set(ctx, "is");
break;
case OFPACT_WRITE_METADATA:
metadata = ofpact_get_WRITE_METADATA(a);
flow->metadata &= ~metadata->mask;
flow->metadata |= metadata->metadata & metadata->mask;
break;
case OFPACT_METER:
xlate_meter_action(ctx, ofpact_get_METER(a));
break;
case OFPACT_SAMPLE:
xlate_sample_action(ctx, ofpact_get_SAMPLE(a));
break;
case OFPACT_CLONE:
compose_clone(ctx, ofpact_get_CLONE(a), last);
break;
OF support and translation of generic encap and decap This commit adds support for the OpenFlow actions generic encap and decap (as specified in ONF EXT-382) to the OVS control plane. CLI syntax for encap action with properties: encap(<header>) encap(<header>(<prop>=<value>,<tlv>(<class>,<type>,<value>),...)) For example: encap(ethernet) encap(nsh(md_type=1)) encap(nsh(md_type=2,tlv(0x1000,10,0x12345678),tlv(0x2000,20,0xfedcba9876543210))) CLI syntax for decap action: decap() decap(packet_type(ns=<pt_ns>,type=<pt_type>)) For example: decap() decap(packet_type(ns=0,type=0xfffe)) decap(packet_type(ns=1,type=0x894f)) The first header supported for encap and decap is "ethernet" to convert packets between packet_type (1,Ethertype) and (0,0). This commit also implements a skeleton for the translation of generic encap and decap actions in ofproto-dpif and adds support to encap and decap an Ethernet header. In general translation of encap commits pending actions and then rewrites struct flow in accordance with the new packet type and header. In the case of encap(ethernet) it suffices to change the packet type from (1, Ethertype) to (0,0) and set the dl_type accordingly. A new pending_encap flag in xlate ctx is set to mark that an corresponding datapath encap action must be triggered at the next commit. In the case of encap(ethernet) ofproto generetas a push_eth action. The general case for translation of decap() is to emit a datapath action to decap the current outermost header and then recirculate the packet to reparse the inner headers. In the special case of an Ethernet packet, decap() just changes the packet type from (0,0) to (1, dl_type) without a need to recirculate. The emission of the pop_eth action for the datapath is postponed to the next commit. Hence encap(ethernet) and decap() on an Ethernet packet are OF octions that only incur a cost in the dataplane when a modifed packet is actually committed, e.g. because it is sent out. 
They can freely be used for normalizing the packet type in the OF pipeline without degrading performance. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-08-02 16:04:12 +08:00
case OFPACT_ENCAP:
xlate_generic_encap_action(ctx, ofpact_get_ENCAP(a));
break;
case OFPACT_DECAP: {
bool recirc_needed =
xlate_generic_decap_action(ctx, ofpact_get_DECAP(a));
if (!ctx->error && recirc_needed) {
/* Recirculate for parsing of inner packet. */
ctx_trigger_freeze(ctx);
/* Then continue with next action. */
a = ofpact_next(a);
}
break;
}
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
case OFPACT_CT:
compose_conntrack_action(ctx, ofpact_get_CT(a), last);
Add support for connection tracking. This patch adds a new action and fields to OVS that allow connection tracking to be performed. This support works in conjunction with the Linux kernel support merged into the Linux-4.3 development cycle. Packets have two possible states with respect to connection tracking: Untracked packets have not previously passed through the connection tracker, while tracked packets have previously been through the connection tracker. For OpenFlow pipeline processing, untracked packets can become tracked, and they will remain tracked until the end of the pipeline. Tracked packets cannot become untracked. Connections can be unknown, uncommitted, or committed. Packets which are untracked have unknown connection state. To know the connection state, the packet must become tracked. Uncommitted connections have no connection state stored about them, so it is only possible for the connection tracker to identify whether they are a new connection or whether they are invalid. Committed connections have connection state stored beyond the lifetime of the packet, which allows later packets in the same connection to be identified as part of the same established connection, or related to an existing connection - for instance ICMP error responses. The new 'ct' action transitions the packet from "untracked" to "tracked" by sending this flow through the connection tracker. The following parameters are supported initally: - "commit": When commit is executed, the connection moves from uncommitted state to committed state. This signals that information about the connection should be stored beyond the lifetime of the packet within the pipeline. This allows future packets in the same connection to be recognized as part of the same "established" (est) connection, as well as identifying packets in the reply (rpl) direction, or packets related to an existing connection (rel). - "zone=[u16|NXM]": Perform connection tracking in the zone specified. 
Each zone is an independent connection tracking context. When the "commit" parameter is used, the connection will only be committed in the specified zone, and not in other zones. This is 0 by default. - "table=NUMBER": Fork pipeline processing in two. The original instance of the packet will continue processing the current actions list as an untracked packet. An additional instance of the packet will be sent to the connection tracker, which will be re-injected into the OpenFlow pipeline to resume processing in the specified table, with the ct_state and other ct match fields set. If the table is not specified, then the packet is submitted to the connection tracker, but the pipeline does not fork and the ct match fields are not populated. It is strongly recommended to specify a table later than the current table to prevent loops. When the "table" option is used, the packet that continues processing in the specified table will have the ct_state populated. The ct_state may have any of the following flags set: - Tracked (trk): Connection tracking has occurred. - Reply (rpl): The flow is in the reply direction. - Invalid (inv): The connection tracker couldn't identify the connection. - New (new): This is the beginning of a new connection. - Established (est): This is part of an already existing connection. - Related (rel): This connection is related to an existing connection. For more information, consult the ovs-ofctl(8) man pages. Below is a simple example flow table to allow outbound TCP traffic from port 1 and drop traffic from port 2 that was not initiated by port 1: table=0,priority=1,action=drop table=0,arp,action=normal table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2 table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1) table=1,in_port=2,ct_state=+trk+est,tcp,action=1 table=1,in_port=2,ct_state=+trk+new,tcp,action=drop Based on original design by Justin Pettit, contributions from Thomas Graf and Daniele Di Proietto. 
Signed-off-by: Joe Stringer <joestringer@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Ben Pfaff <blp@nicira.com>
2015-08-11 10:56:09 -07:00
break;
case OFPACT_CT_CLEAR:
if (ctx->conntracked) {
compose_ct_clear_action(ctx);
}
break;
case OFPACT_NAT:
/* This will be processed by compose_conntrack_action(). */
ctx->ct_nat_action = ofpact_get_NAT(a);
break;
case OFPACT_DEBUG_RECIRC:
ctx_trigger_freeze(ctx);
a = ofpact_next(a);
break;
case OFPACT_DEBUG_SLOW:
ctx->xout->slow |= SLOW_ACTION;
break;
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
case OFPACT_CHECK_PKT_LARGER: {
const struct ofpact *remaining_acts = ofpact_next(a);
size_t remaining_acts_len = ofpact_remaining_len(remaining_acts,
ofpacts,
ofpacts_len);
xlate_check_pkt_larger(ctx, ofpact_get_CHECK_PKT_LARGER(a),
remaining_acts, remaining_acts_len);
if (ctx->xbridge->support.check_pkt_len) {
/* If datapath supports check_pkt_len, then
* xlate_check_pkt_larger() does the translation for the
* ofpacts following 'a'. */
exit = true;
}
Add a new OVS action check_pkt_larger This patch adds a new action 'check_pkt_larger' which checks if the packet is larger than the given size and stores the result in the destination register. Usage: check_pkt_larger(len)->REGISTER Eg. match=...,actions=check_pkt_larger(1442)->NXM_NX_REG0[0],next; This patch makes use of the new datapath action - 'check_pkt_len' which was recently added in the commit [1]. At the start of ovs-vswitchd, datapath is probed for this action. If the datapath action is present, then 'check_pkt_larger' makes use of this datapath action. Datapath action 'check_pkt_len' takes these nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER (optional) - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL (optional) - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. Let's say we have these flows added to an OVS bridge br-int table=0, priority=100 in_port=1,ip,actions=check_pkt_larger:100->NXM_NX_REG0[0],resubmit(,1) table=1, priority=200,in_port=1,ip,reg0=0x1/0x1 actions=output:3 table=1, priority=100,in_port=1,ip,actions=output:4 Then the action 'check_pkt_larger' will be translated as - check_pkt_len(size=100,gt(3),le(4)) datapath will check the packet length and if the packet length is greater than 100, it will output to port 3, else it will output to port 4. In case, datapath doesn't support 'check_pkt_len' action, the OVS action 'check_pkt_larger' sets SLOW_ACTION so that datapath flow is not added. This OVS action is intended to be used by OVN to check the packet length and generate an ICMP packet with type 3, code 4 and next hop mtu in the logical router pipeline if the MTU of the physical interface is lesser than the packet length. 
More information can be found here [2] [1] - https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 [2] - https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Numan Siddique <nusiddiq@redhat.com> CC: Ben Pfaff <blp@ovn.org> CC: Gregory Rose <gvrose8192@gmail.com> Acked-by: Mark Michelson <mmichels@redhat.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2019-04-23 00:53:38 +05:30
break;
}
}
/* Check if need to store this and the remaining actions for later
* execution. */
if (!ctx->error && ctx->exit && ctx_first_frozen_action(ctx)) {
freeze_unroll_actions(a, ofpact_end(ofpacts, ofpacts_len), ctx);
break;
}
}
}
/* Initializes 'xin' for translating 'flow' through 'ofproto' at tables
 * version 'version'.  'in_port' overrides the input port recorded in 'flow',
 * and the action-set output port is reset to OFPP_UNSET.
 *
 * 'packet' may be null; translation side effects are enabled only when a
 * packet is provided.  'rule', 'wc', and 'odp_actions' may be null as well.
 * Members not covered by the parameters are set to defaults here and may be
 * overridden by the caller before translation.
 *
 * (This rewrite also drops garbled commit-log text that had been fused into
 * the function body, which made the file uncompilable.) */
void
xlate_in_init(struct xlate_in *xin, struct ofproto_dpif *ofproto,
              ovs_version_t version, const struct flow *flow,
              ofp_port_t in_port, struct rule_dpif *rule, uint16_t tcp_flags,
              const struct dp_packet *packet, struct flow_wildcards *wc,
              struct ofpbuf *odp_actions)
{
    /* Translation context and flow. */
    xin->ofproto = ofproto;
    xin->tables_version = version;
    xin->flow = *flow;
    xin->upcall_flow = flow;    /* Pointer to the unmodified original flow. */
    xin->flow.in_port.ofp_port = in_port;
    xin->flow.actset_output = OFPP_UNSET;

    /* Packet and rule under translation. */
    xin->packet = packet;
    xin->allow_side_effects = packet != NULL;
    xin->rule = rule;
    xin->tcp_flags = tcp_flags;

    /* Optional members the caller may fill in after initialization. */
    xin->xcache = NULL;
    xin->ofpacts = NULL;
    xin->ofpacts_len = 0;
    xin->trace = NULL;
    xin->resubmit_stats = NULL;
    xin->wc = wc;
    xin->odp_actions = odp_actions;

    /* Bookkeeping. */
    xin->depth = 0;
    xin->resubmits = 0;
    xin->in_packet_out = false;
    xin->recirc_queue = NULL;
    xin->xport_uuid = UUID_ZERO;

    /* If the flow carries a recirculation ID, look up the state that was
     * frozen when that ID was allocated. */
    xin->frozen_state = NULL;
    if (flow->recirc_id) {
        const struct recirc_id_node *node
            = recirc_id_node_find(flow->recirc_id);

        if (node) {
            xin->frozen_state = &node->state;
        }
    }
}
/* Releases the resources held by 'xout'.  A null 'xout' is a no-op. */
void
xlate_out_uninit(struct xlate_out *xout)
{
    if (!xout) {
        return;
    }
    recirc_refs_unref(&xout->recircs);
}
/* Looks up the skb_priority-to-DSCP mapping for 'skb_priority' on 'xport'.
 * Returns the matching entry, or NULL if none exists. */
static struct skb_priority_to_dscp *
get_skb_priority(const struct xport *xport, uint32_t skb_priority)
{
    uint32_t hash = hash_int(skb_priority, 0);
    struct skb_priority_to_dscp *entry;

    HMAP_FOR_EACH_IN_BUCKET (entry, hmap_node, hash, &xport->skb_priorities) {
        if (entry->skb_priority == skb_priority) {
            return entry;
        }
    }

    return NULL;
}
/* Translates 'skb_priority' into a DSCP value for 'xport', storing it in
 * '*dscp'.  Returns true if a mapping was found; otherwise stores 0 in
 * '*dscp' and returns false. */
static bool
dscp_from_skb_priority(const struct xport *xport, uint32_t skb_priority,
                       uint8_t *dscp)
{
    const struct skb_priority_to_dscp *entry
        = get_skb_priority(xport, skb_priority);

    if (entry) {
        *dscp = entry->dscp;
        return true;
    }

    *dscp = 0;
    return false;
}
/* Returns the number of skb_priority-to-DSCP mappings configured on
 * 'xport'. */
static size_t
count_skb_priorities(const struct xport *xport)
{
    const struct hmap *priorities = &xport->skb_priorities;

    return hmap_count(priorities);
}
/* Removes and frees every skb_priority-to-DSCP mapping on 'xport'. */
static void
clear_skb_priorities(struct xport *xport)
{
    struct skb_priority_to_dscp *entry;

    /* HMAP_FOR_EACH_POP detaches each node before the body runs, so freeing
     * the entry inside the loop is safe. */
    HMAP_FOR_EACH_POP (entry, hmap_node, &xport->skb_priorities) {
        free(entry);
    }
}
/* Returns true if the datapath actions accumulated so far in 'ctx' include
 * an output to the bridge's local port (OFPP_LOCAL). */
static bool
actions_output_to_local_port(const struct xlate_ctx *ctx)
{
    odp_port_t local_port = ofp_port_to_odp_port(ctx->xbridge, OFPP_LOCAL);
    const struct nlattr *attr;
    unsigned int left;

    NL_ATTR_FOR_EACH_UNSAFE (attr, left, ctx->odp_actions->data,
                             ctx->odp_actions->size) {
        if (nl_attr_type(attr) != OVS_ACTION_ATTR_OUTPUT) {
            continue;
        }
        if (nl_attr_get_odp_port(attr) == local_port) {
            return true;
        }
    }

    return false;
}
#if defined(__linux__)
/* Returns the maximum number of packets that the Linux kernel is willing to
 * queue up internally to certain kinds of software-implemented ports, or the
 * default (and rarely modified) value if it cannot be determined.
 *
 * The value is read from /proc once, on first call, and cached. */
static int
netdev_max_backlog(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static int max_backlog = 1000; /* The normal default value. */

    if (ovsthread_once_start(&once)) {
        static const char filename[] = "/proc/sys/net/core/netdev_max_backlog";
        FILE *stream = fopen(filename, "r");

        if (!stream) {
            VLOG_INFO("%s: open failed (%s)", filename, ovs_strerror(errno));
        } else {
            int n;

            if (fscanf(stream, "%d", &n) != 1) {
                VLOG_WARN("%s: read error", filename);
            } else if (n <= 100) {
                /* Values this small are almost certainly bogus; keep the
                 * default instead. */
                VLOG_WARN("%s: unexpectedly small value %d", filename, n);
            } else {
                max_backlog = n;
            }
            fclose(stream);
        }
        ovsthread_once_done(&once);

        VLOG_DBG("%s: using %d max_backlog", filename, max_backlog);
    }

    return max_backlog;
}
/* Counts and returns the number of OVS_ACTION_ATTR_OUTPUT and
 * OVS_ACTION_ATTR_LB_OUTPUT actions in 'odp_actions'.  LB_OUTPUT is the
 * userspace datapath's combined hash-and-bond-output action, so it also
 * transmits a packet and must count toward the output total. */
static int
count_output_actions(const struct ofpbuf *odp_actions)
{
    const struct nlattr *a;
    size_t left;
    int n = 0;

    NL_ATTR_FOR_EACH_UNSAFE (a, left, odp_actions->data, odp_actions->size) {
        if ((a->nla_type == OVS_ACTION_ATTR_OUTPUT) ||
            (a->nla_type == OVS_ACTION_ATTR_LB_OUTPUT)) {
            n++;
        }
    }
    return n;
}
#endif /* defined(__linux__) */
/* Returns true if 'odp_actions' contains more output actions than the datapath
 * can reliably handle in one go.  On Linux, this is the value of the
 * net.core.netdev_max_backlog sysctl, which limits the maximum number of
 * packets that the kernel is willing to queue up for processing while the
 * datapath is processing a set of actions. */
static bool
too_many_output_actions(const struct ofpbuf *odp_actions OVS_UNUSED)
{
#ifdef __linux__
    /* Cheap pre-check: the buffer cannot hold more output actions than it
     * has 4-byte attribute slots, so skip the exact count when the buffer
     * is small enough. */
    if (odp_actions->size / NL_A_U32_SIZE <= netdev_max_backlog()) {
        return false;
    }
    return count_output_actions(odp_actions) > netdev_max_backlog();
#else
    /* OSes other than Linux might have similar limits, but we don't know how
     * to determine them.*/
    return false;
#endif
}
/* Initializes 'ctx->wc' for a translation: starts from a catch-all mask and
 * unwildcards the fields that every translation examines. */
static void
xlate_wc_init(struct xlate_ctx *ctx)
{
    struct flow_wildcards *wc = ctx->wc;

    flow_wildcards_init_catchall(wc);

    /* Some fields we consider to always be examined. */
    WC_MASK_FIELD(wc, packet_type);
    WC_MASK_FIELD(wc, in_port);
    WC_MASK_FIELD(wc, dl_type);
    if (is_ip_any(&ctx->xin->flow)) {
        WC_MASK_FIELD_MASK(wc, nw_frag, FLOW_NW_FRAG_MASK);
    }

    if (ctx->xbridge->support.odp.recirc) {
        /* Always exactly match recirc_id when datapath supports
         * recirculation. */
        WC_MASK_FIELD(wc, recirc_id);
    }

    if (ctx->xbridge->netflow) {
        netflow_mask_wc(&ctx->xin->flow, wc);
    }

    tnl_wc_init(&ctx->xin->flow, wc);
}
static void
xlate_wc_finish(struct xlate_ctx *ctx)
{
int i;
/* Clear the metadata and register wildcard masks, because we won't
* use non-header fields as part of the cache. */
flow_wildcards_clear_non_packet_fields(ctx->wc);
/* Wildcard Ethernet address fields if the original packet type was not
* Ethernet.
*
* (The Ethertype field is used even when the original packet type is not
* Ethernet.) */
userspace: Switching of L3 packets in L2 pipeline Ports have a new layer3 attribute if they send/receive L3 packets. The packet_type included in structs dp_packet and flow is considered in ofproto-dpif. The classical L2 match fields (dl_src, dl_dst, dl_type, and vlan_tci, vlan_vid, vlan_pcp) now have Ethernet as pre-requisite. A dummy ethernet header is pushed to L3 packets received from L3 ports before the the pipeline processing starts. The ethernet header is popped before sending a packet to a L3 port. For datapath ports that can receive L2 or L3 packets, the packet_type becomes part of the flow key for datapath flows and is handled appropriately in dpif-netdev. In the 'else' branch in flow_put_on_pmd() function, the additional check flow_equal(&match.flow, &netdev_flow->flow) was removed, as a) the dpcls lookup is sufficient to uniquely identify a flow and b) it caused false negatives because the flow in netdev->flow may not properly masked. In dpif_netdev_flow_put() we now use the same method for constructing the netdev_flow_key as the one used when adding the flow to the dplcs to make sure these always match. The function netdev_flow_key_from_flow() used so far was not only inefficient but sometimes caused mismatches and subsequent flow update failures. The kernel datapath does not support the packet_type match field. Instead it encodes the packet type implictly by the presence or absence of the Ethernet attribute in the flow key and mask. This patch filters the PACKET_TYPE attribute out of netlink flow key and mask to be sent to the kernel datapath. Signed-off-by: Lorand Jakab <lojakab@cisco.com> Signed-off-by: Simon Horman <simon.horman@netronome.com> Signed-off-by: Jiri Benc <jbenc@redhat.com> Signed-off-by: Yi Yang <yi.y.yang@intel.com> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Co-authored-by: Zoltan Balogh <zoltan.balogh@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2017-06-02 16:16:17 +00:00
if (ctx->xin->upcall_flow->packet_type != htonl(PT_ETH)) {
ctx->wc->masks.dl_dst = eth_addr_zero;
ctx->wc->masks.dl_src = eth_addr_zero;
}
/* ICMPv4 and ICMPv6 have 8-bit "type" and "code" fields. struct flow
* uses the low 8 bits of the 16-bit tp_src and tp_dst members to
* represent these fields. The datapath interface, on the other hand,
* represents them with just 8 bits each. This means that if the high
* 8 bits of the masks for these fields somehow become set, then they
* will get chopped off by a round trip through the datapath, and
* revalidation will spot that as an inconsistency and delete the flow.
* Avoid the problem here by making sure that only the low 8 bits of
* either field can be unwildcarded for ICMP.
*/
if (is_icmpv4(&ctx->xin->flow, NULL) || is_icmpv6(&ctx->xin->flow, NULL)) {
ctx->wc->masks.tp_src &= htons(UINT8_MAX);
ctx->wc->masks.tp_dst &= htons(UINT8_MAX);
}
/* VLAN_TCI CFI bit must be matched if any of the TCI is matched. */
for (i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) {
if (ctx->wc->masks.vlans[i].tci) {
ctx->wc->masks.vlans[i].tci |= htons(VLAN_CFI);
}
}
/* The classifier might return masks that match on tp_src and tp_dst even
* for later fragments. This happens because there might be flows that
* match on tp_src or tp_dst without matching on the frag bits, because
* it is not a prerequisite for OpenFlow. Since it is a prerequisite for
* datapath flows and since tp_src and tp_dst are always going to be 0,
* wildcard the fields here. */
if (ctx->xin->flow.nw_frag & FLOW_NW_FRAG_LATER) {
ctx->wc->masks.tp_src = 0;
ctx->wc->masks.tp_dst = 0;
}
/* Clear flow wildcard bits for fields which are not present
* in the original packet header. These wildcards may get set
* due to push/set_field actions. This results into frequent
* invalidation of datapath flows by revalidator thread. */
/* Clear mpls label wc bits if original packet is non-mpls. */
if (!eth_type_mpls(ctx->xin->upcall_flow->dl_type)) {
for (i = 0; i < FLOW_MAX_MPLS_LABELS; i++) {
ctx->wc->masks.mpls_lse[i] = 0;
}
}
/* Clear vlan header wc bits if original packet does not have
* vlan header. */
for (i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) {
if (!eth_type_vlan(ctx->xin->upcall_flow->vlans[i].tpid)) {
ctx->wc->masks.vlans[i].tpid = 0;
ctx->wc->masks.vlans[i].tci = 0;
}
}
/* Clear tunnel wc bits if original packet is non-tunnel. */
if (!flow_tnl_dst_is_set(&ctx->xin->upcall_flow->tunnel)) {
memset(&ctx->wc->masks.tunnel, 0, sizeof ctx->wc->masks.tunnel);
}
}
/* This will optimize the odp actions generated.  For now, it will remove
 * trailing clone actions that are unnecessary. */
static void
xlate_optimize_odp_actions(struct xlate_in *xin)
{
    struct ofpbuf *actions = xin->odp_actions;
    struct nlattr *tail = NULL;
    struct nlattr *a;
    int left;

    if (!actions) {
        return;
    }

    /* Find the last action in the set. */
    NL_ATTR_FOR_EACH (a, left, actions->data, actions->size) {
        tail = a;
    }

    /* Remove the trailing clone() action, by directly embedding the nested
     * actions. */
    if (tail && nl_attr_type(tail) == OVS_ACTION_ATTR_CLONE) {
        size_t payload_size = nl_attr_get_size(tail);
        void *dest;

        nl_msg_reset_size(actions,
                          (unsigned char *) tail
                          - (unsigned char *) actions->data);
        /* The payload moves toward the front of the same buffer, so the
         * regions may overlap: memmove, not memcpy. */
        dest = nl_msg_put_uninit(actions, payload_size);
        memmove(dest, nl_attr_get(tail), payload_size);
    }
}
/* Translates the flow, actions, or rule in 'xin' into datapath actions in
* 'xout'.
* The caller must take responsibility for eventually freeing 'xout', with
* xlate_out_uninit().
* Returns 'XLATE_OK' if translation was successful. In case of an error an
* empty set of actions will be returned in 'xin->odp_actions' (if non-NULL),
* so that most callers may ignore the return value and transparently install a
* drop flow when the translation fails. */
enum xlate_error
xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
{
*xout = (struct xlate_out) {
.slow = 0,
.recircs = RECIRC_REFS_EMPTY_INITIALIZER,
};
struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
struct xbridge *xbridge = xbridge_lookup(xcfg, xin->ofproto);
if (!xbridge) {
return XLATE_BRIDGE_NOT_FOUND;
}
struct flow *flow = &xin->flow;
uint8_t stack_stub[1024];
uint64_t action_set_stub[1024 / 8];
uint64_t frozen_actions_stub[1024 / 8];
uint64_t actions_stub[256 / 8];
struct ofpbuf scratch_actions = OFPBUF_STUB_INITIALIZER(actions_stub);
struct xlate_ctx ctx = {
.xin = xin,
.xout = xout,
.base_flow = *flow,
.orig_tunnel_ipv6_dst = flow_tnl_dst(&flow->tunnel),
.xcfg = xcfg,
.xbridge = xbridge,
.stack = OFPBUF_STUB_INITIALIZER(stack_stub),
.rule = xin->rule,
.wc = (xin->wc
? xin->wc
: &(struct flow_wildcards) { .masks = { .dl_type = 0 } }),
.odp_actions = xin->odp_actions ? xin->odp_actions : &scratch_actions,
.depth = xin->depth,
.resubmits = xin->resubmits,
.in_action_set = false,
.in_packet_out = xin->in_packet_out,
.pending_encap = false,
.pending_decap = false,
.encap_data = NULL,
.table_id = 0,
.rule_cookie = OVS_BE64_MAX,
.orig_skb_priority = flow->skb_priority,
.sflow_n_outputs = 0,
.sflow_odp_port = 0,
.nf_output_iface = NF_OUT_DROP,
.exit = false,
.error = XLATE_OK,
.mirrors = 0,
.freezing = false,
.recirc_update_dp_hash = false,
.frozen_actions = OFPBUF_STUB_INITIALIZER(frozen_actions_stub),
.pause = NULL,
.was_mpls = false,
.conntracked = false,
.ct_nat_action = NULL,
.action_set_has_group = false,
.action_set = OFPBUF_STUB_INITIALIZER(action_set_stub),
};
/* 'base_flow' reflects the packet as it came in, but we need it to reflect
* the packet as the datapath will treat it for output actions. Our
* datapath doesn't retain tunneling information without us re-setting
* it, so clear the tunnel data.
*/
memset(&ctx.base_flow.tunnel, 0, sizeof ctx.base_flow.tunnel);
ofpbuf_reserve(ctx.odp_actions, NL_A_U32_SIZE);
xlate_wc_init(&ctx);
COVERAGE_INC(xlate_actions);
xin->trace = xlate_report(&ctx, OFT_BRIDGE, "bridge(\"%s\")",
xbridge->name);
if (xin->frozen_state) {
const struct frozen_state *state = xin->frozen_state;
struct ovs_list *old_trace = xin->trace;
xin->trace = xlate_report(&ctx, OFT_THAW, "thaw");
if (xin->ofpacts_len > 0 || ctx.rule) {
xlate_report_error(&ctx, "Recirculation conflict (%s)!",
xin->ofpacts_len ? "actions" : "rule");
ctx.error = XLATE_RECIRCULATION_CONFLICT;
goto exit;
}
/* Set the bridge for post-recirculation processing if needed. */
if (!uuid_equals(&ctx.xbridge->ofproto->uuid, &state->ofproto_uuid)) {
const struct xbridge *new_bridge
= xbridge_lookup_by_uuid(xcfg, &state->ofproto_uuid);
if (OVS_UNLIKELY(!new_bridge)) {
/* Drop the packet if the bridge cannot be found. */
xlate_report_error(&ctx, "Frozen bridge no longer exists.");
ctx.error = XLATE_BRIDGE_NOT_FOUND;
xin->trace = old_trace;
goto exit;
}
ctx.xbridge = new_bridge;
/* The bridge is now known so obtain its table version. */
ctx.xin->tables_version
= ofproto_dpif_get_tables_version(ctx.xbridge->ofproto);
}
/* Set the thawed table id. Note: A table lookup is done only if there
* are no frozen actions. */
ctx.table_id = state->table_id;
xlate_report(&ctx, OFT_THAW,
"Resuming from table %"PRIu8, ctx.table_id);
ctx.conntracked = state->conntracked;
if (!state->conntracked) {
clear_conntrack(&ctx);
}
/* Restore pipeline metadata. May change flow's in_port and other
* metadata to the values that existed when freezing was triggered. */
frozen_metadata_to_flow(&ctx.xbridge->ofproto->up,
&state->metadata, flow);
/* Restore stack, if any. */
if (state->stack) {
ofpbuf_put(&ctx.stack, state->stack, state->stack_size);
}
/* Restore mirror state. */
ctx.mirrors = state->mirrors;
/* Restore action set, if any. */
if (state->action_set_len) {
xlate_report_actions(&ctx, OFT_THAW, "Restoring action set",
state->action_set, state->action_set_len);
flow->actset_output = OFPP_UNSET;
xlate_write_actions__(&ctx, state->action_set,
state->action_set_len);
}
/* Restore frozen actions. If there are no actions, processing will
* start with a lookup in the table set above. */
xin->ofpacts = state->ofpacts;
xin->ofpacts_len = state->ofpacts_len;
if (state->ofpacts_len) {
xlate_report_actions(&ctx, OFT_THAW, "Restoring actions",
xin->ofpacts, xin->ofpacts_len);
}
xin->trace = old_trace;
} else if (OVS_UNLIKELY(flow->recirc_id)) {
xlate_report_error(&ctx,
"Recirculation context not found for ID %"PRIx32,
flow->recirc_id);
ctx.error = XLATE_NO_RECIRCULATION_CONTEXT;
goto exit;
}
if (!xin->frozen_state
&& xin->flow.ct_state
&& xin->flow.ct_state & CS_TRACKED) {
ctx.conntracked = true;
}
/* Tunnel metadata in udpif format must be normalized before translation. */
if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
const struct tun_table *tun_tab = ofproto_get_tun_tab(
&ctx.xbridge->ofproto->up);
int err;
err = tun_metadata_from_geneve_udpif(tun_tab, &xin->upcall_flow->tunnel,
&xin->upcall_flow->tunnel,
&flow->tunnel);
if (err) {
xlate_report_error(&ctx, "Invalid Geneve tunnel metadata");
ctx.error = XLATE_INVALID_TUNNEL_METADATA;
goto exit;
}
} else if (!flow->tunnel.metadata.tab) {
/* If the original flow did not come in on a tunnel, then it won't have
* FLOW_TNL_F_UDPIF set. However, we still need to have a metadata
* table in case we generate tunnel actions. */
flow->tunnel.metadata.tab = ofproto_get_tun_tab(
&ctx.xbridge->ofproto->up);
}
ctx.wc->masks.tunnel.metadata.tab = flow->tunnel.metadata.tab;
/* Get the proximate input port of the packet. (If xin->frozen_state,
* flow->in_port is the ultimate input port of the packet.) */
struct xport *in_port = get_ofp_port(xbridge,
ctx.base_flow.in_port.ofp_port);
if (in_port && !in_port->peer) {
ctx.xin->xport_uuid = in_port->uuid;
}
if (flow->packet_type != htonl(PT_ETH) && in_port &&
in_port->pt_mode == NETDEV_PT_LEGACY_L3 && ctx.table_id == 0) {
/* Add dummy Ethernet header to non-L2 packet if it's coming from a
* L3 port. So all packets will be L2 packets for lookup.
* The dl_type has already been set from the packet_type. */
flow->packet_type = htonl(PT_ETH);
flow->dl_src = eth_addr_zero;
flow->dl_dst = eth_addr_zero;
ctx.pending_encap = true;
}
if (!xin->ofpacts && !ctx.rule) {
ctx.rule = rule_dpif_lookup_from_table(
ctx.xbridge->ofproto, ctx.xin->tables_version, flow, ctx.wc,
ctx.xin->resubmit_stats, &ctx.table_id,
flow->in_port.ofp_port, true, true, ctx.xin->xcache);
if (ctx.xin->resubmit_stats) {
rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats, false);
}
if (ctx.xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
entry->rule = ctx.rule;
ofproto-dpif: Unhide structure contents. Until now, ofproto-dpif.c has hidden the definitions of several structures, such as struct ofproto_dpif and struct rule_dpif. This kind of information hiding is often beneficial, because it forces code outside the file with the definition to use the documented interfaces. However, in this case it was starting to burden ofproto-dpif with an increasing number of trivial helpers that were not improving or maintaining a useful abstraction and that were making code harder to maintain and read. Information hiding also made it hard to move blocks of code outside ofproto-dpif.c itself, since any code moved out often needed new helpers if it used anything that wasn't previously exposed. In the present instance, upcoming patches will move code for tracing outside ofproto-dpif, and this would require adding several helpers that would just obscure the function of the code otherwise needlessly. In balance, it seems that there is more harm than good in the information hiding here, so this commit moves the definitions of several structures from ofproto-dpif.c into ofproto-dpif.h. It also removes all of the trivial helpers that had accumulated, instead changing their users to directly access the members that they needed. It also reorganizes ofproto-dpif.h, grouping structure definitions and function prototypes in a sensible way. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Lance Richardson <lrichard@redhat.com> Acked-by: Justin Pettit <jpettit@ovn.org>
2016-12-06 14:08:42 -08:00
ofproto_rule_ref(&ctx.rule->up);
}
xlate_report_table(&ctx, ctx.rule, ctx.table_id);
}
/* Tunnel stats only for not-thawed packets. */
if (!xin->frozen_state && in_port && in_port->is_tunnel) {
if (ctx.xin->resubmit_stats) {
netdev_vport_inc_rx(in_port->netdev, ctx.xin->resubmit_stats);
if (in_port->bfd) {
bfd_account_rx(in_port->bfd, ctx.xin->resubmit_stats);
}
}
if (ctx.xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx.xin->xcache, XC_NETDEV);
entry->dev.rx = netdev_ref(in_port->netdev);
entry->dev.bfd = bfd_ref(in_port->bfd);
}
}
if (!xin->frozen_state && process_special(&ctx, in_port)) {
/* process_special() did all the processing for this packet.
*
* We do not perform special processing on thawed packets, since that
* was done before they were frozen and should not be redone. */
mirror_ingress_packet(&ctx);
} else if (in_port && in_port->xbundle
&& xbundle_mirror_out(xbridge, in_port->xbundle)) {
xlate_report_error(&ctx, "dropping packet received on port "
"%s, which is reserved exclusively for mirroring",
in_port->xbundle->name);
} else {
/* Sampling is done on initial reception; don't redo after thawing. */
unsigned int user_cookie_offset = 0;
if (!xin->frozen_state) {
user_cookie_offset = compose_sflow_action(&ctx);
compose_ipfix_action(&ctx, ODPP_NONE);
}
size_t sample_actions_len = ctx.odp_actions->size;
userspace: Improved packet drop statistics. Currently OVS maintains explicit packet drop/error counters only on port level. Packets that are dropped as part of normal OpenFlow processing are counted in flow stats of “drop” flows or as table misses in table stats. These can only be interpreted by controllers that know the semantics of the configured OpenFlow pipeline. Without that knowledge, it is impossible for an OVS user to obtain e.g. the total number of packets dropped due to OpenFlow rules. Furthermore, there are numerous other reasons for which packets can be dropped by OVS slow path that are not related to the OpenFlow pipeline. The generated datapath flow entries include a drop action to avoid further expensive upcalls to the slow path, but subsequent packets dropped by the datapath are not accounted anywhere. Finally, the datapath itself drops packets in certain error situations. Also, these drops are today not accounted for.This makes it difficult for OVS users to monitor packet drop in an OVS instance and to alert a management system in case of a unexpected increase of such drops. Also OVS trouble-shooters face difficulties in analysing packet drops. With this patch we implement following changes to address the issues mentioned above. 1. Identify and account all the silent packet drop scenarios 2. Display these drops in ovs-appctl coverage/show Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Keshav Gupta <keshugupta1@gmail.com> Signed-off-by: Anju Thomas <anju.thomas@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Keshav Gupta <keshugupta1@gmail.com> Acked-by: Eelco Chaudron <echaudro@redhat.com Acked-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2019-12-18 05:48:12 +01:00
bool ecn_drop = !tnl_process_ecn(flow);
userspace: Improved packet drop statistics. Currently OVS maintains explicit packet drop/error counters only on port level. Packets that are dropped as part of normal OpenFlow processing are counted in flow stats of “drop” flows or as table misses in table stats. These can only be interpreted by controllers that know the semantics of the configured OpenFlow pipeline. Without that knowledge, it is impossible for an OVS user to obtain e.g. the total number of packets dropped due to OpenFlow rules. Furthermore, there are numerous other reasons for which packets can be dropped by OVS slow path that are not related to the OpenFlow pipeline. The generated datapath flow entries include a drop action to avoid further expensive upcalls to the slow path, but subsequent packets dropped by the datapath are not accounted anywhere. Finally, the datapath itself drops packets in certain error situations. Also, these drops are today not accounted for.This makes it difficult for OVS users to monitor packet drop in an OVS instance and to alert a management system in case of a unexpected increase of such drops. Also OVS trouble-shooters face difficulties in analysing packet drops. With this patch we implement following changes to address the issues mentioned above. 1. Identify and account all the silent packet drop scenarios 2. Display these drops in ovs-appctl coverage/show Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Keshav Gupta <keshugupta1@gmail.com> Signed-off-by: Anju Thomas <anju.thomas@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Keshav Gupta <keshugupta1@gmail.com> Acked-by: Eelco Chaudron <echaudro@redhat.com Acked-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2019-12-18 05:48:12 +01:00
if (!ecn_drop
&& (!in_port || may_receive(in_port, &ctx))) {
const struct ofpact *ofpacts;
size_t ofpacts_len;
if (xin->ofpacts) {
ofpacts = xin->ofpacts;
ofpacts_len = xin->ofpacts_len;
} else if (ctx.rule) {
const struct rule_actions *actions
ofproto-dpif: Unhide structure contents. Until now, ofproto-dpif.c has hidden the definitions of several structures, such as struct ofproto_dpif and struct rule_dpif. This kind of information hiding is often beneficial, because it forces code outside the file with the definition to use the documented interfaces. However, in this case it was starting to burden ofproto-dpif with an increasing number of trivial helpers that were not improving or maintaining a useful abstraction and that were making code harder to maintain and read. Information hiding also made it hard to move blocks of code outside ofproto-dpif.c itself, since any code moved out often needed new helpers if it used anything that wasn't previously exposed. In the present instance, upcoming patches will move code for tracing outside ofproto-dpif, and this would require adding several helpers that would just obscure the function of the code otherwise needlessly. In balance, it seems that there is more harm than good in the information hiding here, so this commit moves the definitions of several structures from ofproto-dpif.c into ofproto-dpif.h. It also removes all of the trivial helpers that had accumulated, instead changing their users to directly access the members that they needed. It also reorganizes ofproto-dpif.h, grouping structure definitions and function prototypes in a sensible way. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Lance Richardson <lrichard@redhat.com> Acked-by: Justin Pettit <jpettit@ovn.org>
2016-12-06 14:08:42 -08:00
= rule_get_actions(&ctx.rule->up);
ofpacts = actions->ofpacts;
ofpacts_len = actions->ofpacts_len;
ofproto-dpif: Unhide structure contents. Until now, ofproto-dpif.c has hidden the definitions of several structures, such as struct ofproto_dpif and struct rule_dpif. This kind of information hiding is often beneficial, because it forces code outside the file with the definition to use the documented interfaces. However, in this case it was starting to burden ofproto-dpif with an increasing number of trivial helpers that were not improving or maintaining a useful abstraction and that were making code harder to maintain and read. Information hiding also made it hard to move blocks of code outside ofproto-dpif.c itself, since any code moved out often needed new helpers if it used anything that wasn't previously exposed. In the present instance, upcoming patches will move code for tracing outside ofproto-dpif, and this would require adding several helpers that would just obscure the function of the code otherwise needlessly. In balance, it seems that there is more harm than good in the information hiding here, so this commit moves the definitions of several structures from ofproto-dpif.c into ofproto-dpif.h. It also removes all of the trivial helpers that had accumulated, instead changing their users to directly access the members that they needed. It also reorganizes ofproto-dpif.h, grouping structure definitions and function prototypes in a sensible way. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Lance Richardson <lrichard@redhat.com> Acked-by: Justin Pettit <jpettit@ovn.org>
2016-12-06 14:08:42 -08:00
ctx.rule_cookie = ctx.rule->up.flow_cookie;
} else {
OVS_NOT_REACHED();
}
ofproto-dpif-xlate: Rewrite mirroring to better fit flow translation. Until now, mirroring has been implemented by accumulating, across the whole translation process, a set of mirrors that should receive a mirrored packet. After translation was complete, mirroring restored the original version of the packet and sent that version to the mirrors. That implementation was ugly for multiple reasons. First, it means that we have to keep a copy of the original packet (or its headers, actually), which is expensive. Second, it doesn't really make sense to mirror a version of a packet that is different from the one originally output. Third, it interacted with recirculation; mirroring needed to happen only after recirculation was complete, but this was never properly implemented, so that (I think) mirroring never happened for packets that were recirculated. This commit changes how mirroring works. Now, a packet is mirrored at the point in translation when it becomes eligible for it: for mirrors based on ingress port, this is at ingress; for mirrors based on egress port, this is at egress. (Duplicates are dropped.) Mirroring happens on the version of the packet as it exists when it becomes eligible. Finally, since mirroring happens immediately, it interacts better with recirculation (it still isn't perfect, since duplicate mirroring will occur if a packet is eligible for mirroring both before and after recirculation; this is not difficult to fix and an upcoming commit later in this series will do so). Finally, this commit removes more code from xlate_actions() than it adds, which in my opinion makes it easier to understand. Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
2015-07-29 17:00:49 -07:00
mirror_ingress_packet(&ctx);
ofproto-dpif-xlate: Fix packet_in reason for Table-miss rule Currently in OvS if we hit "Table-miss" rules (associated with Controller action) then we send PACKET_IN message to controller with reason as OFPR_NO_MATCH. “Table-miss” rule is one whose priority is 0 and its catch all rule. But if we hit same "Table-miss" rule after executing group entry we will send the reason as OFPR_ACTION (for OF1.3 and below) and OFPR_GROUP (for OF1.4 and above). This is because once we execute group entry we set ctx->in_group and later when we hit the "Table-miss" rule, Since ctx->in_group is set we send reason as OFPR_ACTION (for OF1.3) and OFPR_GROUP (for OF1.4 and above). For eg: for the following pipeline, we will send the reason as OFPR_ACTION even if we hit The “Table-miss” rule. cookie=0x8000000, duration=761.189s, table=0, n_packets=1401, n_bytes=67954, priority=4,in_port=9,vlan_tci=0x0000/0x1fff actions=write_metadata:0x67870000000000/0xffffff0000000001,goto_table:17 cookie=0x6800001, duration=768.848s, table=17, n_packets=1418, n_bytes=68776, priority=10,metadata=0x67870000000000/0xffffff0000000000 actions=write_metadata:0xe067870000000000/0xfffffffffffffffe,goto_table:60 cookie=0x6800000, duration=24944.312s, table=60, n_packets=58244, n_bytes=2519520, priority=0 actions=resubmit(,17) cookie=0x8040000, duration=785.733s, table=17, n_packets=1450, n_bytes=69724, priority=10,metadata=0xe067870000000000/0xffffff0000000000 actions=write_metadata:0x67871d4d000000/0xfffffffffffffffe,goto_table:43 cookie=0x822002d, duration=24960.795s, table=43, n_packets=53097, n_bytes=2230074, priority=100,arp,arp_op=1 actions=group:6000 group_id=6000,type=all,bucket=actions=CONTROLLER:65535, bucket=actions=resubmit(,48), bucket=actions=resubmit(,81) cookie=0x8500000, duration=24977.323s, table=48, n_packets=58309, n_bytes=2522634, priority=0 actions=resubmit(,49),resubmit(,50) cookie=0x8050000, duration=24984.679s, table=50, n_packets=6, n_bytes=264, priority=0 actions=CONTROLLER:65535 
Currently we are sending table_id as 50 and packet_in reason as OFPR_ACTION. Instead of sending packet_in reason as OFPR_NO_MATCH. Signed-off-by: Keshav Gupta <keshav.gupta@ericsson.com> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
2018-07-04 20:40:53 +05:30
do_xlate_actions(ofpacts, ofpacts_len, &ctx, true, false);
if (ctx.error) {
goto exit;
}
/* We've let OFPP_NORMAL and the learning action look at the
* packet, so cancel all actions and freezing if forwarding is
* disabled. */
if (in_port && (!xport_stp_forward_state(in_port) ||
!xport_rstp_forward_state(in_port))) {
ctx.odp_actions->size = sample_actions_len;
ctx_cancel_freeze(&ctx);
ofpbuf_clear(&ctx.action_set);
userspace: Improved packet drop statistics. Currently OVS maintains explicit packet drop/error counters only on port level. Packets that are dropped as part of normal OpenFlow processing are counted in flow stats of “drop” flows or as table misses in table stats. These can only be interpreted by controllers that know the semantics of the configured OpenFlow pipeline. Without that knowledge, it is impossible for an OVS user to obtain e.g. the total number of packets dropped due to OpenFlow rules. Furthermore, there are numerous other reasons for which packets can be dropped by OVS slow path that are not related to the OpenFlow pipeline. The generated datapath flow entries include a drop action to avoid further expensive upcalls to the slow path, but subsequent packets dropped by the datapath are not accounted anywhere. Finally, the datapath itself drops packets in certain error situations. Also, these drops are today not accounted for.This makes it difficult for OVS users to monitor packet drop in an OVS instance and to alert a management system in case of a unexpected increase of such drops. Also OVS trouble-shooters face difficulties in analysing packet drops. With this patch we implement following changes to address the issues mentioned above. 1. Identify and account all the silent packet drop scenarios 2. Display these drops in ovs-appctl coverage/show Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Keshav Gupta <keshugupta1@gmail.com> Signed-off-by: Anju Thomas <anju.thomas@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Keshav Gupta <keshugupta1@gmail.com> Acked-by: Eelco Chaudron <echaudro@redhat.com Acked-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2019-12-18 05:48:12 +01:00
ctx.error = XLATE_FORWARDING_DISABLED;
}
if (!ctx.freezing) {
xlate_action_set(&ctx);
}
if (ctx.freezing) {
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
finish_freezing(&ctx);
}
userspace: Improved packet drop statistics. Currently OVS maintains explicit packet drop/error counters only on port level. Packets that are dropped as part of normal OpenFlow processing are counted in flow stats of “drop” flows or as table misses in table stats. These can only be interpreted by controllers that know the semantics of the configured OpenFlow pipeline. Without that knowledge, it is impossible for an OVS user to obtain e.g. the total number of packets dropped due to OpenFlow rules. Furthermore, there are numerous other reasons for which packets can be dropped by OVS slow path that are not related to the OpenFlow pipeline. The generated datapath flow entries include a drop action to avoid further expensive upcalls to the slow path, but subsequent packets dropped by the datapath are not accounted anywhere. Finally, the datapath itself drops packets in certain error situations. Also, these drops are today not accounted for.This makes it difficult for OVS users to monitor packet drop in an OVS instance and to alert a management system in case of a unexpected increase of such drops. Also OVS trouble-shooters face difficulties in analysing packet drops. With this patch we implement following changes to address the issues mentioned above. 1. Identify and account all the silent packet drop scenarios 2. Display these drops in ovs-appctl coverage/show Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Keshav Gupta <keshugupta1@gmail.com> Signed-off-by: Anju Thomas <anju.thomas@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Keshav Gupta <keshugupta1@gmail.com> Acked-by: Eelco Chaudron <echaudro@redhat.com Acked-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2019-12-18 05:48:12 +01:00
} else if (ecn_drop) {
ctx.error = XLATE_CONGESTION_DROP;
}
/* Output only fully processed packets. */
if (!ctx.freezing
&& xbridge->has_in_band
&& in_band_must_output_to_local_port(flow)
&& !actions_output_to_local_port(&ctx)) {
WC_MASK_FIELD(ctx.wc, nw_proto);
WC_MASK_FIELD(ctx.wc, tp_src);
WC_MASK_FIELD(ctx.wc, tp_dst);
WC_MASK_FIELD(ctx.wc, dl_type);
xlate_report(&ctx, OFT_DETAIL, "outputting DHCP packet "
"to local port for in-band control");
compose_output_action(&ctx, OFPP_LOCAL, NULL, false, false);
}
if (user_cookie_offset) {
fix_sflow_action(&ctx, user_cookie_offset);
}
}
if (nl_attr_oversized(ctx.odp_actions->size)) {
/* These datapath actions are too big for a Netlink attribute, so we
* can't hand them to the kernel directly. dpif_execute() can execute
* them one by one with help, so just mark the result as SLOW_ACTION to
* prevent the flow from being installed. */
COVERAGE_INC(xlate_actions_oversize);
ctx.xout->slow |= SLOW_ACTION;
} else if (too_many_output_actions(ctx.odp_actions)) {
COVERAGE_INC(xlate_actions_too_many_output);
ctx.xout->slow |= SLOW_ACTION;
}
/* Update NetFlow for non-frozen traffic. */
if (xbridge->netflow && !xin->frozen_state) {
if (ctx.xin->resubmit_stats) {
netflow_flow_update(xbridge->netflow, flow,
ctx.nf_output_iface,
ctx.xin->resubmit_stats);
}
if (ctx.xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx.xin->xcache, XC_NETFLOW);
entry->nf.netflow = netflow_ref(xbridge->netflow);
entry->nf.flow = xmemdup(flow, sizeof *flow);
entry->nf.iface = ctx.nf_output_iface;
}
}
tun-metadata: Manage tunnel TLV mapping table on a per-bridge basis. When using tunnel TLVs (at the moment, this means Geneve options), a controller must first map the class and type onto an appropriate OXM field so that it can be used in OVS flow operations. This table is managed using OpenFlow extensions. The original code that added support for TLVs made the mapping table global as a simplification. However, this is not really logically correct as the OpenFlow management commands are operating on a per-bridge basis. This removes the original limitation to make the table per-bridge. One nice result of this change is that it is generally clearer whether the tunnel metadata is in datapath or OpenFlow format. Rather than allowing ad-hoc format changes and trying to handle both formats in the tunnel metadata functions, the format is more clearly separated by function. Datapaths (both kernel and userspace) use datapath format and it is not changed during the upcall process. At the beginning of action translation, tunnel metadata is converted to OpenFlow format and flows and wildcards are translated back at the end of the process. As an additional benefit, this change improves performance in some flow setup situations by keeping the tunnel metadata in the original packet format in more cases. This helps when copies need to be made as the amount of data touched is only what is present in the packet rather than the maximum amount of metadata supported. Co-authored-by: Madhu Challa <challa@noironetworks.com> Signed-off-by: Madhu Challa <challa@noironetworks.com> Signed-off-by: Jesse Gross <jesse@kernel.org> Acked-by: Ben Pfaff <blp@ovn.org>
2016-04-19 18:36:04 -07:00
/* Translate tunnel metadata masks to udpif format if necessary. */
if (xin->upcall_flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
if (ctx.wc->masks.tunnel.metadata.present.map) {
const struct flow_tnl *upcall_tnl = &xin->upcall_flow->tunnel;
struct geneve_opt opts[TLV_TOT_OPT_SIZE /
sizeof(struct geneve_opt)];
tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
&ctx.wc->masks.tunnel,
upcall_tnl->metadata.opts.gnv,
upcall_tnl->metadata.present.len,
opts);
memset(&ctx.wc->masks.tunnel.metadata, 0,
sizeof ctx.wc->masks.tunnel.metadata);
memcpy(&ctx.wc->masks.tunnel.metadata.opts.gnv, opts,
upcall_tnl->metadata.present.len);
}
ctx.wc->masks.tunnel.metadata.present.len = 0xff;
ctx.wc->masks.tunnel.metadata.tab = NULL;
ctx.wc->masks.tunnel.flags |= FLOW_TNL_F_UDPIF;
} else if (!xin->upcall_flow->tunnel.metadata.tab) {
/* If we didn't have options in UDPIF format and didn't have an existing
* metadata table, then it means that there were no options at all when
* we started processing and any wildcards we picked up were from
* action generation. Without options on the incoming packet, wildcards
* aren't meaningful. To avoid them possibly getting misinterpreted,
* just clear everything. */
if (ctx.wc->masks.tunnel.metadata.present.map) {
memset(&ctx.wc->masks.tunnel.metadata, 0,
sizeof ctx.wc->masks.tunnel.metadata);
} else {
ctx.wc->masks.tunnel.metadata.tab = NULL;
}
}
xlate_wc_finish(&ctx);
exit:
tun-metadata: Manage tunnel TLV mapping table on a per-bridge basis. When using tunnel TLVs (at the moment, this means Geneve options), a controller must first map the class and type onto an appropriate OXM field so that it can be used in OVS flow operations. This table is managed using OpenFlow extensions. The original code that added support for TLVs made the mapping table global as a simplification. However, this is not really logically correct as the OpenFlow management commands are operating on a per-bridge basis. This removes the original limitation to make the table per-bridge. One nice result of this change is that it is generally clearer whether the tunnel metadata is in datapath or OpenFlow format. Rather than allowing ad-hoc format changes and trying to handle both formats in the tunnel metadata functions, the format is more clearly separated by function. Datapaths (both kernel and userspace) use datapath format and it is not changed during the upcall process. At the beginning of action translation, tunnel metadata is converted to OpenFlow format and flows and wildcards are translated back at the end of the process. As an additional benefit, this change improves performance in some flow setup situations by keeping the tunnel metadata in the original packet format in more cases. This helps when copies need to be made as the amount of data touched is only what is present in the packet rather than the maximum amount of metadata supported. Co-authored-by: Madhu Challa <challa@noironetworks.com> Signed-off-by: Madhu Challa <challa@noironetworks.com> Signed-off-by: Jesse Gross <jesse@kernel.org> Acked-by: Ben Pfaff <blp@ovn.org>
2016-04-19 18:36:04 -07:00
/* Reset the table to what it was when we came in. If we only fetched
* it locally, then it has no meaning outside of flow translation. */
flow->tunnel.metadata.tab = xin->upcall_flow->tunnel.metadata.tab;
ofpbuf_uninit(&ctx.stack);
ofpbuf_uninit(&ctx.action_set);
ofpbuf_uninit(&ctx.frozen_actions);
ofpbuf_uninit(&scratch_actions);
ofpbuf_delete(ctx.encap_data);
/* Make sure we return a "drop flow" in case of an error. */
if (ctx.error) {
xout->slow = 0;
if (xin->odp_actions) {
ofpbuf_clear(xin->odp_actions);
}
} else {
/* In the non-error case, see if we can further optimize the datapath
* rules by removing redundant (clone) actions. */
xlate_optimize_odp_actions(xin);
}
userspace: Improved packet drop statistics. Currently OVS maintains explicit packet drop/error counters only on port level. Packets that are dropped as part of normal OpenFlow processing are counted in flow stats of “drop” flows or as table misses in table stats. These can only be interpreted by controllers that know the semantics of the configured OpenFlow pipeline. Without that knowledge, it is impossible for an OVS user to obtain e.g. the total number of packets dropped due to OpenFlow rules. Furthermore, there are numerous other reasons for which packets can be dropped by OVS slow path that are not related to the OpenFlow pipeline. The generated datapath flow entries include a drop action to avoid further expensive upcalls to the slow path, but subsequent packets dropped by the datapath are not accounted anywhere. Finally, the datapath itself drops packets in certain error situations. Also, these drops are today not accounted for.This makes it difficult for OVS users to monitor packet drop in an OVS instance and to alert a management system in case of a unexpected increase of such drops. Also OVS trouble-shooters face difficulties in analysing packet drops. With this patch we implement following changes to address the issues mentioned above. 1. Identify and account all the silent packet drop scenarios 2. Display these drops in ovs-appctl coverage/show Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Keshav Gupta <keshugupta1@gmail.com> Signed-off-by: Anju Thomas <anju.thomas@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Keshav Gupta <keshugupta1@gmail.com> Acked-by: Eelco Chaudron <echaudro@redhat.com Acked-by: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2019-12-18 05:48:12 +01:00
/* Install drop action if datapath supports explicit drop action. */
if (xin->odp_actions && !xin->odp_actions->size &&
ovs_explicit_drop_action_supported(ctx.xbridge->ofproto)) {
put_drop_action(xin->odp_actions, ctx.error);
}
/* Since congestion drop and forwarding drop are not exactly
* translation error, we are resetting the translation error.
*/
if (ctx.error == XLATE_CONGESTION_DROP ||
ctx.error == XLATE_FORWARDING_DISABLED) {
ctx.error = XLATE_OK;
}
return ctx.error;
}
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
enum ofperr
xlate_resume(struct ofproto_dpif *ofproto,
const struct ofputil_packet_in_private *pin,
struct ofpbuf *odp_actions,
enum slow_path_reason *slow,
struct flow *flow,
struct xlate_cache *xcache)
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
{
struct dp_packet packet;
dp_packet_use_const(&packet, pin->base.packet,
pin->base.packet_len);
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
pkt_metadata_from_flow(&packet.md, &pin->base.flow_metadata.flow);
flow_extract(&packet, flow);
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
struct xlate_in xin;
xlate_in_init(&xin, ofproto, ofproto_dpif_get_tables_version(ofproto),
flow, 0, NULL, ntohs(flow->tcp_flags),
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
&packet, NULL, odp_actions);
xin.xcache = xcache;
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
struct ofpact_note noop;
ofpact_init_NOTE(&noop);
noop.length = 0;
bool any_actions = pin->actions_len > 0;
struct frozen_state state = {
.table_id = 0, /* Not the table where NXAST_PAUSE was executed. */
.ofproto_uuid = pin->bridge,
.stack = pin->stack,
.stack_size = pin->stack_size,
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
.mirrors = pin->mirrors,
.conntracked = pin->conntracked,
.xport_uuid = UUID_ZERO,
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
/* When there are no actions, xlate_actions() will search the flow
* table. We don't want it to do that (we want it to resume), so
* supply a no-op action if there aren't any.
*
* (We can't necessarily avoid translating actions entirely if there
* aren't any actions, because there might be some finishing-up to do
* at the end of the pipeline, and we don't check for those
* conditions.) */
.ofpacts = any_actions ? pin->actions : &noop.ofpact,
.ofpacts_len = any_actions ? pin->actions_len : sizeof noop,
.action_set = pin->action_set,
.action_set_len = pin->action_set_len,
};
frozen_metadata_from_flow(&state.metadata,
&pin->base.flow_metadata.flow);
Implement serializing the state of packet traversal in "continuations". One purpose of OpenFlow packet-in messages is to allow a controller to interpose on the path of a packet through the flow tables. If, for example, the controller needs to modify a packet in some way that the switch doesn't directly support, the controller should be able to program the switch to send it the packet, then modify the packet and send it back to the switch to continue through the flow table. That's the theory. In practice, this doesn't work with any but the simplest flow tables. Packet-in messages simply don't include enough context to allow the flow table traversal to continue. For example: * Via "resubmit" actions, an Open vSwitch packet can have an effective "call stack", but a packet-in can't describe it, and so it would be lost. * A packet-in can't preserve the stack used by NXAST_PUSH and NXAST_POP actions. * A packet-in can't preserve the OpenFlow 1.1+ action set. * A packet-in can't preserve the state of Open vSwitch mirroring or connection tracking. This commit introduces a solution called "continuations". A continuation is the state of a packet's traversal through OpenFlow flow tables. A "controller" action with the "pause" flag, which is newly implemented in this commit, generates a continuation and sends it to the OpenFlow controller in a packet-in asynchronous message (only NXT_PACKET_IN2 supports continuations, so the controller must configure them with NXT_SET_PACKET_IN_FORMAT). The controller processes the packet-in, possibly modifying some of its data, and sends it back to the switch with an NXT_RESUME request, which causes flow table traversal to continue. In principle, a single packet can be paused and resumed multiple times. Another way to look at it is: - "pause" is an extension of the existing OFPAT_CONTROLLER action. 
It sends the packet to the controller, with full pipeline context (some of which is switch implementation dependent, and may thus vary from switch to switch). - A continuation is an extension of OFPT_PACKET_IN, allowing for implementation dependent metadata. - NXT_RESUME is an extension of OFPT_PACKET_OUT, with the semantics that the pipeline processing is continued with the original translation context from where it was left at the time it was paused. Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Jarno Rajahalme <jarno@ovn.org>
2016-02-19 16:10:06 -08:00
xin.frozen_state = &state;
struct xlate_out xout;
enum xlate_error error = xlate_actions(&xin, &xout);
*slow = xout.slow;
xlate_out_uninit(&xout);
/* xlate_actions() can generate a number of errors, but only
* XLATE_BRIDGE_NOT_FOUND really stands out to me as one that we should be
* sure to report over OpenFlow. The others could come up in packet-outs
* or regular flow translation and I don't think that it's going to be too
* useful to report them to the controller. */
return error == XLATE_BRIDGE_NOT_FOUND ? OFPERR_NXR_STALE : 0;
}
/* Sends 'packet' out 'ofport'.  If 'ofport' is a tunnel and that tunnel type
 * supports a notion of an OAM flag, sets it if 'oam' is true.
 * May modify 'packet'.
 * Returns 0 if successful, otherwise a positive errno value. */
int
xlate_send_packet(const struct ofport_dpif *ofport, bool oam,
                  struct dp_packet *packet)
{
    struct xlate_cfg *cfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);

    struct xport *port = xport_lookup(cfg, ofport);
    if (!port) {
        return EINVAL;
    }

    /* Use OFPP_NONE as the in_port to avoid special packet processing. */
    struct flow flow;
    flow_extract(packet, &flow);
    flow.in_port.ofp_port = OFPP_NONE;

    uint64_t stub[1024 / 8];
    struct ofpbuf actions;
    ofpbuf_use_stack(&actions, stub, sizeof stub);

    if (oam) {
        const ovs_be16 oam_flag = htons(NX_TUN_FLAG_OAM);
        ofpact_put_set_field(&actions, mf_from_id(MFF_TUN_FLAGS),
                             &oam_flag, &oam_flag);
    }
    ofpact_put_OUTPUT(&actions)->port = port->ofp_port;

    /* Actions here are not referring to anything versionable (flow tables or
     * groups) so we don't need to worry about the version here. */
    return ofproto_dpif_execute_actions(port->xbridge->ofproto,
                                        OVS_VERSION_MAX, &flow, NULL,
                                        actions.data, actions.size, packet);
}
ofproto-dpif: APIs and CLI option to add/delete static fdb entry. Currently there is an option to add/flush/show ARP/ND neighbor. This covers L3 side. For L2 side, there is only fdb show command. This commit gives an option to add/del an fdb entry via ovs-appctl. CLI command looks like: To add: ovs-appctl fdb/add <bridge> <port> <vlan> <Mac> ovs-appctl fdb/add br0 p1 0 50:54:00:00:00:05 To del: ovs-appctl fdb/del <bridge> <vlan> <Mac> ovs-appctl fdb/del br0 0 50:54:00:00:00:05 Added two new APIs to provide convenient interface to add and delete static-macs. bool xlate_add_static_mac_entry(const struct ofproto_dpif *, ofp_port_t in_port, struct eth_addr dl_src, int vlan); bool xlate_delete_static_mac_entry(const struct ofproto_dpif *, struct eth_addr dl_src, int vlan); 1. Static entry should not age. To indicate that entry being programmed is a static entry, 'expires' field in 'struct mac_entry' will be set to a MAC_ENTRY_AGE_STATIC_ENTRY. A check for this value is made while deleting mac entry as part of regular aging process. 2. Another change to the mac-update logic, when a packet with same dl_src as that of a static-mac entry arrives on any port, the logic will not modify the expires field. 3. While flushing fdb entries, made sure static ones are not evicted. 4. Updated "ovs-appctl fdb/stats-show br0" to display number of static entries in switch Added following tests: ofproto-dpif - static-mac add/del/flush ofproto-dpif - static-mac mac moves Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2019-June/048894.html Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1597752 Signed-off-by: Vasu Dasari <vdasari@gmail.com> Tested-by: Eelco Chaudron <echaudro@redhat.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2021-06-29 16:43:39 -04:00
/* Returns the xbundle that 'ofp_port' belongs to within 'ofproto', or NULL
 * if the bridge is unknown or the port has no input bundle. */
static struct xbundle *
ofp_port_to_xbundle(const struct ofproto_dpif *ofproto, ofp_port_t ofp_port)
{
    struct xlate_cfg *cfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
    struct xbridge *bridge = xbridge_lookup(cfg, ofproto);

    return bridge ? lookup_input_bundle__(bridge, ofp_port, NULL) : NULL;
}
/* Updates the MAC learning table of 'ofproto' for 'dl_src' seen on 'in_port'
 * in 'vlan'.  'is_grat_arp' tells the learning logic whether the triggering
 * packet was a gratuitous ARP.  A no-op if 'in_port' has no bundle. */
void
xlate_mac_learning_update(const struct ofproto_dpif *ofproto,
                          ofp_port_t in_port, struct eth_addr dl_src,
                          int vlan, bool is_grat_arp)
{
    struct xbundle *bundle = ofp_port_to_xbundle(ofproto, in_port);

    if (bundle) {
        update_learning_table__(bundle->xbridge, bundle, dl_src, vlan,
                                is_grat_arp);
    }
}
/* Adds a static (non-aging) MAC learning entry mapping 'dl_src' in 'vlan' to
 * the bundle of 'in_port' on 'ofproto'.  Returns true on success, false if
 * 'in_port' does not map to a usable bundle or the entry cannot be added. */
bool
xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto,
                           ofp_port_t in_port,
                           struct eth_addr dl_src, int vlan)
{
    struct xbundle *bundle = ofp_port_to_xbundle(ofproto, in_port);

    /* The OFPP_NONE pseudo-bundle is not a real port to learn against. */
    if (!bundle || bundle == &ofpp_none_bundle) {
        return false;
    }
    return mac_learning_add_static_entry(ofproto->ml, dl_src, vlan,
                                         bundle->ofbundle);
}
/* Deletes the static MAC learning entry for 'dl_src' in 'vlan' from
 * 'ofproto''s MAC learning table.  Returns true if an entry was deleted,
 * false otherwise (per mac_learning_del_static_entry()). */
bool
xlate_delete_static_mac_entry(const struct ofproto_dpif *ofproto,
                              struct eth_addr dl_src, int vlan)
{
    return mac_learning_del_static_entry(ofproto->ml, dl_src, vlan);
}
/* Copies the datapath backer feature-support flags in 'support' into the
 * xbridge for 'ofproto', if that bridge exists in the current xlate
 * configuration; otherwise does nothing. */
void
xlate_set_support(const struct ofproto_dpif *ofproto,
                  const struct dpif_backer_support *support)
{
    struct xlate_cfg *cfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
    struct xbridge *bridge = xbridge_lookup(cfg, ofproto);

    if (!bridge) {
        return;
    }
    bridge->support = *support;
}