2016-05-17 17:31:33 -07:00
|
|
|
|
/*
|
|
|
|
|
* Copyright (c) 2016 Nicira, Inc.
|
2016-06-02 07:18:49 -03:00
|
|
|
|
* Copyright (c) 2016 Red Hat, Inc.
|
2016-05-17 17:31:33 -07:00
|
|
|
|
*
|
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
|
* You may obtain a copy of the License at:
|
|
|
|
|
*
|
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
*
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
|
* limitations under the License.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
|
2016-06-02 07:18:47 -03:00
|
|
|
|
#include "netdev-native-tnl.h"
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
#include <errno.h>
|
|
|
|
|
#include <fcntl.h>
|
|
|
|
|
#include <sys/socket.h>
|
|
|
|
|
#include <net/if.h>
|
2017-11-06 14:42:32 -08:00
|
|
|
|
#include <sys/types.h>
|
2016-05-20 05:52:19 +00:00
|
|
|
|
#include <netinet/in.h>
|
2016-06-02 07:18:47 -03:00
|
|
|
|
#include <netinet/ip.h>
|
2016-05-17 17:31:33 -07:00
|
|
|
|
#include <netinet/ip6.h>
|
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <sys/time.h>
|
|
|
|
|
|
|
|
|
|
#include "byte-order.h"
|
2025-03-13 13:43:35 +01:00
|
|
|
|
#include "coverage.h"
|
2016-05-17 17:31:33 -07:00
|
|
|
|
#include "csum.h"
|
|
|
|
|
#include "dp-packet.h"
|
2016-06-02 07:18:47 -03:00
|
|
|
|
#include "netdev.h"
|
2016-05-17 17:31:33 -07:00
|
|
|
|
#include "netdev-vport.h"
|
|
|
|
|
#include "netdev-vport-private.h"
|
|
|
|
|
#include "odp-netlink.h"
|
|
|
|
|
#include "packets.h"
|
2016-06-02 07:18:47 -03:00
|
|
|
|
#include "seq.h"
|
2016-05-17 17:31:33 -07:00
|
|
|
|
#include "unaligned.h"
|
|
|
|
|
#include "unixctl.h"
|
2023-09-03 23:21:54 +08:00
|
|
|
|
#include "util.h"
|
2016-06-02 07:18:47 -03:00
|
|
|
|
#include "openvswitch/vlog.h"
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
|
|
|
|
VLOG_DEFINE_THIS_MODULE(native_tnl);
|
|
|
|
|
static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5);
|
|
|
|
|
|
2025-03-13 13:43:35 +01:00
|
|
|
|
COVERAGE_DEFINE(native_tnl_l3csum_checked);
|
|
|
|
|
COVERAGE_DEFINE(native_tnl_l3csum_err);
|
|
|
|
|
COVERAGE_DEFINE(native_tnl_l4csum_checked);
|
|
|
|
|
COVERAGE_DEFINE(native_tnl_l4csum_err);
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
#define VXLAN_HLEN (sizeof(struct udp_header) + \
|
|
|
|
|
sizeof(struct vxlanhdr))
|
|
|
|
|
|
|
|
|
|
#define GENEVE_BASE_HLEN (sizeof(struct udp_header) + \
|
|
|
|
|
sizeof(struct genevehdr))
|
|
|
|
|
|
2019-11-25 11:19:23 -08:00
|
|
|
|
#define GTPU_HLEN (sizeof(struct udp_header) + \
|
|
|
|
|
sizeof(struct gtpuhdr))
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
uint16_t tnl_udp_port_min = 32768;
|
|
|
|
|
uint16_t tnl_udp_port_max = 61000;
|
|
|
|
|
|
netdev-native-tnl: Fix use of uninitialized RSS hash.
RSS hash calculation for a packet may be skipped in some cases. One
of them is a simple match optimization. Packet is not fully parsed
for the simple match, so there is not enough data to calculate the full
5-tuple hash. However, when such a packet needs tunnel encapsulation,
we need RSS hash to calculate the source port for the outer UDP header.
And netdev_tnl_get_src_port() function doesn't check if the hash is
valid before using it. So, such packets will likely end up with
different and unpredictable source ports potentially causing packet
reordering or other issues in the network:
WARNING: MemorySanitizer: use-of-uninitialized-value
0 0x10c129c in dp_packet_get_rss_hash lib/dp-packet.h:1029:5
1 0x10b264c in netdev_tnl_get_src_port lib/netdev-native-tnl.h:131:12
2 0x10b171a in netdev_tnl_push_udp_header lib/netdev-native-tnl.c:286:20
3 0xb772fe in netdev_push_header lib/netdev.c:1037:13
4 0x9673c4 in push_tnl_action lib/dpif-netdev.c:9067:11
5 0x961abe in dp_execute_cb lib/dpif-netdev.c:9226:13
6 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
7 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
8 0x968f3f in dp_execute_userspace_action lib/dpif-netdev.c:9093:9
9 0x962e54 in dp_execute_cb lib/dpif-netdev.c:9307:17
10 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
11 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
12 0x950fef in packet_batch_per_flow_execute lib/dpif-netdev.c:8271:5
13 0x8ec8db in dp_netdev_input__ lib/dpif-netdev.c:8899:9
14 0x8eb8ec in dp_netdev_input lib/dpif-netdev.c:8908:5
15 0x92d5e8 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5660:19
16 0x8ee2c4 in dpif_netdev_run lib/dpif-netdev.c:6993:25
17 0x9b442f in dpif_run lib/dpif.c:471:16
18 0x5f8e3a in type_run ofproto/ofproto-dpif.c:367:9
19 0x56c508 in ofproto_type_run ofproto/ofproto.c:1879:31
20 0x4cb388 in bridge_run__ vswitchd/bridge.c:3281:9
21 0x4c9b00 in bridge_run vswitchd/bridge.c:3346:5
22 0x526043 in main vswitchd/ovs-vswitchd.c:130:9
23 0x7f1192 in __libc_start_call_main
24 0x7f1192 in __libc_start_main@GLIBC_2.2.5
25 0x432b24 in _start (vswitchd/ovs-vswitchd+0x432b24)
The issue is caught by running the 'debug_slow' test under the memory
sanitizer. Another way to reproduce is by sending two packets at once
through the datapath. The first one will get the same memory chunk as
the upcalled packet with already calculated RSS, the second one will
get the brand new memory chunk without the calculated RSS, so these
two packets will have different source ports after encapsulation.
The test is updated to cover this case.
Fix the issue by checking if the hash is valid before using, re-parsing
and calculating if it is not. The netdev_tnl_get_src_port() function
is moved to the .c file, since there is no real reason for it to be in the
header. The compiler can decide on inlining it. The declaration is kept in
the header, since all the other functions are declared there, even if there
is no reason for that.
In the future we may want to consolidate all the places where we
re-calculate RSS hash into a single function, but it's a little tricky.
This is also a larger change that would be harder to backport. So, not
touching that aspect for now.
Re-parsing the packet eliminates advantages of the simple match, but
it was designed primarily for very simple setups that do not involve
tunneling or any other complex processing, so it should not be a big
problem. And simple match can still be used with tunneling when the
input port provides the RSS hash.
Also, checking if the hash is valid is a right thing to do anyways.
Next step might be to not use simple match when there is no RSS hash
and there is a tunnel push action, but it seems hard to implement,
especially since we don't know the actions until we lookup the flow.
Fixes: e7e9973b80d3 ("dpif-netdev: Forwarding optimization for flows with a simple match.")
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-11-29 17:36:45 +01:00
|
|
|
|
ovs_be16
|
|
|
|
|
netdev_tnl_get_src_port(struct dp_packet *packet)
|
|
|
|
|
{
|
|
|
|
|
uint32_t hash;
|
|
|
|
|
|
|
|
|
|
if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
|
|
|
|
|
hash = dp_packet_get_rss_hash(packet);
|
|
|
|
|
} else {
|
|
|
|
|
struct flow flow;
|
|
|
|
|
|
|
|
|
|
flow_extract(packet, &flow);
|
|
|
|
|
hash = flow_hash_5tuple(&flow, 0);
|
|
|
|
|
|
|
|
|
|
dp_packet_set_rss_hash(packet, hash);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hash = ((uint64_t) hash * (tnl_udp_port_max - tnl_udp_port_min)) >> 32;
|
|
|
|
|
|
|
|
|
|
return htons(hash + tnl_udp_port_min);
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-13 13:43:35 +01:00
|
|
|
|
static void *
|
|
|
|
|
ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
|
2016-05-17 17:31:33 -07:00
|
|
|
|
unsigned int *hlen)
|
|
|
|
|
{
|
|
|
|
|
void *nh;
|
|
|
|
|
struct ip_header *ip;
|
|
|
|
|
struct ovs_16aligned_ip6_hdr *ip6;
|
|
|
|
|
void *l4;
|
|
|
|
|
int l3_size;
|
|
|
|
|
|
|
|
|
|
nh = dp_packet_l3(packet);
|
|
|
|
|
ip = nh;
|
|
|
|
|
ip6 = nh;
|
|
|
|
|
l4 = dp_packet_l4(packet);
|
|
|
|
|
|
|
|
|
|
if (!nh || !l4) {
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*hlen = sizeof(struct eth_header);
|
|
|
|
|
|
|
|
|
|
l3_size = dp_packet_size(packet) -
|
|
|
|
|
((char *)nh - (char *)dp_packet_data(packet));
|
|
|
|
|
|
|
|
|
|
if (IP_VER(ip->ip_ihl_ver) == 4) {
|
2025-03-13 13:43:35 +01:00
|
|
|
|
bool bad_csum = dp_packet_ip_checksum_bad(packet);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
ovs_be32 ip_src, ip_dst;
|
|
|
|
|
|
2023-06-14 15:03:26 -04:00
|
|
|
|
/* A packet coming from a network device might have the
|
|
|
|
|
* csum already checked. In this case, skip the check. */
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
if (OVS_UNLIKELY(!bad_csum && dp_packet_ip_checksum_unknown(packet))) {
|
2025-03-13 13:43:35 +01:00
|
|
|
|
COVERAGE_INC(native_tnl_l3csum_checked);
|
2025-06-17 09:20:59 +02:00
|
|
|
|
if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
|
|
|
|
|
dp_packet_ip_checksum_set_bad(packet);
|
|
|
|
|
bad_csum = true;
|
|
|
|
|
} else {
|
|
|
|
|
dp_packet_ip_checksum_set_good(packet);
|
|
|
|
|
}
|
2025-03-13 13:43:35 +01:00
|
|
|
|
}
|
|
|
|
|
if (OVS_UNLIKELY(bad_csum)) {
|
|
|
|
|
COVERAGE_INC(native_tnl_l3csum_err);
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum");
|
|
|
|
|
return NULL;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ntohs(ip->ip_tot_len) > l3_size) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "ip packet is truncated (IP length %d, actual %d)",
|
|
|
|
|
ntohs(ip->ip_tot_len), l3_size);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
if (IP_IHL(ip->ip_ihl_ver) * 4 > sizeof(struct ip_header)) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "ip options not supported on tunnel packets "
|
|
|
|
|
"(%d bytes)", IP_IHL(ip->ip_ihl_ver) * 4);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ip_src = get_16aligned_be32(&ip->ip_src);
|
|
|
|
|
ip_dst = get_16aligned_be32(&ip->ip_dst);
|
|
|
|
|
|
|
|
|
|
tnl->ip_src = ip_src;
|
|
|
|
|
tnl->ip_dst = ip_dst;
|
|
|
|
|
tnl->ip_tos = ip->ip_tos;
|
|
|
|
|
tnl->ip_ttl = ip->ip_ttl;
|
|
|
|
|
|
|
|
|
|
*hlen += IP_HEADER_LEN;
|
|
|
|
|
|
|
|
|
|
} else if (IP_VER(ip->ip_ihl_ver) == 6) {
|
2016-05-23 20:27:14 -07:00
|
|
|
|
ovs_be32 tc_flow = get_16aligned_be32(&ip6->ip6_flow);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
|
|
|
|
memcpy(tnl->ipv6_src.s6_addr, ip6->ip6_src.be16, sizeof ip6->ip6_src);
|
|
|
|
|
memcpy(tnl->ipv6_dst.s6_addr, ip6->ip6_dst.be16, sizeof ip6->ip6_dst);
|
2016-05-23 20:27:14 -07:00
|
|
|
|
|
|
|
|
|
tnl->ip_tos = ntohl(tc_flow) >> 20;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
tnl->ip_ttl = ip6->ip6_hlim;
|
|
|
|
|
|
2018-03-09 13:02:22 -08:00
|
|
|
|
*hlen += packet->l4_ofs - packet->l3_ofs;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "ipv4 packet has invalid version (%d)",
|
|
|
|
|
IP_VER(ip->ip_ihl_ver));
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return l4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Pushes the 'size' bytes of 'header' into the headroom of 'packet',
|
|
|
|
|
* reallocating the packet if necessary. 'header' should contain an Ethernet
|
|
|
|
|
* header, followed by an IPv4 header (without options), and an L4 header.
|
|
|
|
|
*
|
|
|
|
|
* This function sets the IP header's ip_tot_len field (which should be zeroed
|
|
|
|
|
* as part of 'header') and puts its value into '*ip_tot_size' as well. Also
|
2023-06-14 15:03:26 -04:00
|
|
|
|
* updates IP header checksum if not offloaded, as well as the l3 and l4
|
|
|
|
|
* offsets in the 'packet'.
|
2016-05-17 17:31:33 -07:00
|
|
|
|
*
|
|
|
|
|
* Return pointer to the L4 header added to 'packet'. */
|
|
|
|
|
void *
|
2023-05-23 12:58:21 +09:00
|
|
|
|
netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
|
|
|
|
|
int size, int *ip_tot_size, ovs_be32 ipv6_label)
|
2016-05-17 17:31:33 -07:00
|
|
|
|
{
|
|
|
|
|
struct eth_header *eth;
|
|
|
|
|
struct ip_header *ip;
|
|
|
|
|
struct ovs_16aligned_ip6_hdr *ip6;
|
|
|
|
|
|
|
|
|
|
eth = dp_packet_push_uninit(packet, size);
|
|
|
|
|
*ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);
|
|
|
|
|
|
|
|
|
|
memcpy(eth, header, size);
|
2017-06-02 16:16:21 +00:00
|
|
|
|
/* The encapsulated packet has type Ethernet. Adjust dp_packet. */
|
|
|
|
|
packet->packet_type = htonl(PT_ETH);
|
|
|
|
|
dp_packet_reset_offsets(packet);
|
|
|
|
|
packet->l3_ofs = sizeof (struct eth_header);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
|
|
|
|
if (netdev_tnl_is_header_ipv6(header)) {
|
|
|
|
|
ip6 = netdev_tnl_ipv6_hdr(eth);
|
|
|
|
|
*ip_tot_size -= IPV6_HEADER_LEN;
|
|
|
|
|
ip6->ip6_plen = htons(*ip_tot_size);
|
2023-05-23 12:58:21 +09:00
|
|
|
|
packet_set_ipv6_flow_label(&ip6->ip6_flow, ipv6_label);
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
dp_packet_ip_checksum_set_unknown(packet);
|
|
|
|
|
|
2017-07-19 14:46:02 +01:00
|
|
|
|
packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size;
|
2024-01-17 14:26:30 -05:00
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
return ip6 + 1;
|
|
|
|
|
} else {
|
|
|
|
|
ip = netdev_tnl_ip_hdr(eth);
|
|
|
|
|
ip->ip_tot_len = htons(*ip_tot_size);
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
*ip_tot_size -= IP_HEADER_LEN;
|
2023-06-14 15:03:26 -04:00
|
|
|
|
/* Postpone checksum to when the packet is pushed to the port. */
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
dp_packet_ip_checksum_set_partial(packet);
|
2024-01-17 14:26:30 -05:00
|
|
|
|
|
2017-07-19 14:46:02 +01:00
|
|
|
|
packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size;
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
return ip + 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void *
|
|
|
|
|
udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
|
|
|
|
|
unsigned int *hlen)
|
|
|
|
|
{
|
|
|
|
|
struct udp_header *udp;
|
|
|
|
|
|
2025-03-13 13:43:35 +01:00
|
|
|
|
udp = ip_extract_tnl_md(packet, tnl, hlen);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
if (!udp) {
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (udp->udp_csum) {
|
2025-03-13 13:43:35 +01:00
|
|
|
|
bool bad_csum = dp_packet_l4_checksum_bad(packet);
|
|
|
|
|
|
2025-06-17 09:20:58 +02:00
|
|
|
|
if (OVS_UNLIKELY(!bad_csum && dp_packet_l4_checksum_unknown(packet))) {
|
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports.
Add Rx checksum offloading feature support on DPDK physical ports. By default,
the Rx checksum offloading is enabled if NIC supports. However,
the checksum offloading can be turned OFF either while adding a new DPDK
physical port to OVS or at runtime.
The rx checksum offloading can be turned off by setting the parameter to
'false'. For eg: To disable the rx checksum offloading when adding a port,
'ovs-vsctl add-port br0 dpdk0 -- \
set Interface dpdk0 type=dpdk options:rx-checksum-offload=false'
OR (to disable at run time after port is being added to OVS)
'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false'
Similarly to turn ON rx checksum offloading at run time,
'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true'
The Tx checksum offloading support is not implemented due to the following
reasons.
1) Checksum offloading and vectorization are mutually exclusive in DPDK poll
mode driver. Vector packet processing is turned OFF when checksum offloading
is enabled which causes significant performance drop at Tx side.
2) Normally, OVS generates checksum for tunnel packets in software at the
'tunnel push' operation, where the tunnel headers are created. However
enabling Tx checksum offloading involves,
*) Mark every packets for tx checksum offloading at 'tunnel_push' and
recirculate.
*) At the time of xmit, validate the same flag and instruct the NIC to do the
checksum calculation. In case NIC doesnt support Tx checksum offloading,
the checksum calculation has to be done in software before sending out the
packets.
No significant performance improvement noticed with Tx checksum offloading
due to the e overhead of additional validations + non vector packet processing.
In some test scenarios, it introduces performance drop too.
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling
decapsulation even though the SSE vector Rx function is disabled in DPDK poll
mode driver.
Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
Acked-by: Jesse Gross <jesse@kernel.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
|
|
|
|
uint32_t csum;
|
2025-03-13 13:43:35 +01:00
|
|
|
|
COVERAGE_INC(native_tnl_l4csum_checked);
|
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports.
Add Rx checksum offloading feature support on DPDK physical ports. By default,
the Rx checksum offloading is enabled if NIC supports. However,
the checksum offloading can be turned OFF either while adding a new DPDK
physical port to OVS or at runtime.
The rx checksum offloading can be turned off by setting the parameter to
'false'. For eg: To disable the rx checksum offloading when adding a port,
'ovs-vsctl add-port br0 dpdk0 -- \
set Interface dpdk0 type=dpdk options:rx-checksum-offload=false'
OR (to disable at run time after port is being added to OVS)
'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false'
Similarly to turn ON rx checksum offloading at run time,
'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true'
The Tx checksum offloading support is not implemented due to the following
reasons.
1) Checksum offloading and vectorization are mutually exclusive in DPDK poll
mode driver. Vector packet processing is turned OFF when checksum offloading
is enabled which causes significant performance drop at Tx side.
2) Normally, OVS generates checksum for tunnel packets in software at the
'tunnel push' operation, where the tunnel headers are created. However
enabling Tx checksum offloading involves,
*) Mark every packets for tx checksum offloading at 'tunnel_push' and
recirculate.
*) At the time of xmit, validate the same flag and instruct the NIC to do the
checksum calculation. In case NIC doesnt support Tx checksum offloading,
the checksum calculation has to be done in software before sending out the
packets.
No significant performance improvement noticed with Tx checksum offloading
due to the e overhead of additional validations + non vector packet processing.
In some test scenarios, it introduces performance drop too.
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling
decapsulation even though the SSE vector Rx function is disabled in DPDK poll
mode driver.
Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
Acked-by: Jesse Gross <jesse@kernel.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
|
|
|
|
if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) {
|
|
|
|
|
csum = packet_csum_pseudoheader6(dp_packet_l3(packet));
|
|
|
|
|
} else {
|
|
|
|
|
csum = packet_csum_pseudoheader(dp_packet_l3(packet));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
csum = csum_continue(csum, udp, dp_packet_size(packet) -
|
|
|
|
|
((const unsigned char *)udp -
|
2017-04-25 16:29:59 +00:00
|
|
|
|
(const unsigned char *)dp_packet_eth(packet)
|
netdev-dpdk: Enable Rx checksum offloading feature on DPDK physical ports.
Add Rx checksum offloading feature support on DPDK physical ports. By default,
the Rx checksum offloading is enabled if NIC supports. However,
the checksum offloading can be turned OFF either while adding a new DPDK
physical port to OVS or at runtime.
The rx checksum offloading can be turned off by setting the parameter to
'false'. For eg: To disable the rx checksum offloading when adding a port,
'ovs-vsctl add-port br0 dpdk0 -- \
set Interface dpdk0 type=dpdk options:rx-checksum-offload=false'
OR (to disable at run time after port is being added to OVS)
'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false'
Similarly to turn ON rx checksum offloading at run time,
'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true'
The Tx checksum offloading support is not implemented due to the following
reasons.
1) Checksum offloading and vectorization are mutually exclusive in DPDK poll
mode driver. Vector packet processing is turned OFF when checksum offloading
is enabled which causes significant performance drop at Tx side.
2) Normally, OVS generates checksum for tunnel packets in software at the
'tunnel push' operation, where the tunnel headers are created. However
enabling Tx checksum offloading involves,
*) Mark every packets for tx checksum offloading at 'tunnel_push' and
recirculate.
*) At the time of xmit, validate the same flag and instruct the NIC to do the
checksum calculation. In case NIC doesnt support Tx checksum offloading,
the checksum calculation has to be done in software before sending out the
packets.
No significant performance improvement noticed with Tx checksum offloading
due to the e overhead of additional validations + non vector packet processing.
In some test scenarios, it introduces performance drop too.
Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling
decapsulation even though the SSE vector Rx function is disabled in DPDK poll
mode driver.
Signed-off-by: Sugesh Chandran <sugesh.chandran@intel.com>
Acked-by: Jesse Gross <jesse@kernel.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
2017-01-02 14:27:48 -08:00
|
|
|
|
));
|
2025-06-17 09:20:59 +02:00
|
|
|
|
if (csum_finish(csum)) {
|
|
|
|
|
dp_packet_l4_checksum_set_bad(packet);
|
|
|
|
|
bad_csum = true;
|
|
|
|
|
} else {
|
|
|
|
|
dp_packet_l4_checksum_set_good(packet);
|
|
|
|
|
}
|
2025-03-13 13:43:35 +01:00
|
|
|
|
}
|
|
|
|
|
if (OVS_UNLIKELY(bad_csum)) {
|
|
|
|
|
COVERAGE_INC(native_tnl_l4csum_err);
|
|
|
|
|
return NULL;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
tnl->flags |= FLOW_TNL_F_CSUM;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tnl->tp_src = udp->udp_src;
|
|
|
|
|
tnl->tp_dst = udp->udp_dst;
|
|
|
|
|
|
|
|
|
|
return udp + 1;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-17 14:26:30 -05:00
|
|
|
|
static void
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_push(struct dp_packet *packet,
|
|
|
|
|
const struct ovs_action_push_tnl *data)
|
2024-01-17 14:26:30 -05:00
|
|
|
|
{
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
packet->offloads <<= DP_PACKET_OL_SHIFT_COUNT;
|
2024-01-17 14:26:30 -05:00
|
|
|
|
|
2024-05-30 15:10:14 +02:00
|
|
|
|
if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) {
|
2025-06-17 09:20:56 +02:00
|
|
|
|
dp_packet_tunnel_set_geneve(packet);
|
2024-05-30 15:10:14 +02:00
|
|
|
|
} else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
|
2025-06-17 09:20:56 +02:00
|
|
|
|
dp_packet_tunnel_set_vxlan(packet);
|
2025-01-16 00:21:31 -05:00
|
|
|
|
} else if (data->tnl_type == OVS_VPORT_TYPE_GRE ||
|
|
|
|
|
data->tnl_type == OVS_VPORT_TYPE_IP6GRE) {
|
2025-06-17 09:20:56 +02:00
|
|
|
|
dp_packet_tunnel_set_gre(packet);
|
2024-01-17 14:26:30 -05:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and leaves the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated but kept as bad by OVS, an unknown checksum is either
an IPv6 one or, if it was an IPv4 one, OVS updated it too (keeping it good
or bad accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
static void
|
|
|
|
|
tnl_ol_pop(struct dp_packet *packet, int off)
|
|
|
|
|
{
|
|
|
|
|
packet->offloads >>= DP_PACKET_OL_SHIFT_COUNT;
|
|
|
|
|
|
|
|
|
|
dp_packet_reset_packet(packet, off);
|
|
|
|
|
}
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
void
|
2018-03-09 13:02:23 -08:00
|
|
|
|
netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED,
|
|
|
|
|
struct dp_packet *packet,
|
2016-05-17 17:31:33 -07:00
|
|
|
|
const struct ovs_action_push_tnl *data)
|
|
|
|
|
{
|
2024-01-17 14:26:31 -05:00
|
|
|
|
uint16_t l3_ofs = packet->l3_ofs;
|
|
|
|
|
uint16_t l4_ofs = packet->l4_ofs;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
struct udp_header *udp;
|
netdev-native-tnl: Fix use of uninitialized RSS hash.
RSS hash calculation for a packet may be skipped in some cases. One
of them is a simple match optimization. Packet is not fully parsed
for the simple match, so there is no enough data to calculate the full
5-tuple hash. However, when such a packet needs tunnel encapsulation,
we need RSS hash to calculate the source port for the outer UDP header.
And netdev_tnl_get_src_port() function doesn't check if the hash is
valid before using it. So, such packets will likely end up with
different and unpredictable source ports potentially causing packet
reordering or other issues in the network:
WARNING: MemorySanitizer: use-of-uninitialized-value
0 0x10c129c in dp_packet_get_rss_hash lib/dp-packet.h:1029:5
1 0x10b264c in netdev_tnl_get_src_port lib/netdev-native-tnl.h:131:12
2 0x10b171a in netdev_tnl_push_udp_header lib/netdev-native-tnl.c:286:20
3 0xb772fe in netdev_push_header lib/netdev.c:1037:13
4 0x9673c4 in push_tnl_action lib/dpif-netdev.c:9067:11
5 0x961abe in dp_execute_cb lib/dpif-netdev.c:9226:13
6 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
7 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
8 0x968f3f in dp_execute_userspace_action lib/dpif-netdev.c:9093:9
9 0x962e54 in dp_execute_cb lib/dpif-netdev.c:9307:17
10 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
11 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
12 0x950fef in packet_batch_per_flow_execute lib/dpif-netdev.c:8271:5
13 0x8ec8db in dp_netdev_input__ lib/dpif-netdev.c:8899:9
14 0x8eb8ec in dp_netdev_input lib/dpif-netdev.c:8908:5
15 0x92d5e8 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5660:19
16 0x8ee2c4 in dpif_netdev_run lib/dpif-netdev.c:6993:25
17 0x9b442f in dpif_run lib/dpif.c:471:16
18 0x5f8e3a in type_run ofproto/ofproto-dpif.c:367:9
19 0x56c508 in ofproto_type_run ofproto/ofproto.c:1879:31
20 0x4cb388 in bridge_run__ vswitchd/bridge.c:3281:9
21 0x4c9b00 in bridge_run vswitchd/bridge.c:3346:5
22 0x526043 in main vswitchd/ovs-vswitchd.c:130:9
23 0x7f1192 in __libc_start_call_main
24 0x7f1192 in __libc_start_main@GLIBC_2.2.5
25 0x432b24 in _start (vswitchd/ovs-vswitchd+0x432b24)
The issue is caught by running the 'debug_slow' test under the memory
sanitizer. Another way to reproduce is by sending two packets at once
through the datapath. The first one will get the same memory chunk as
the upcalled packet with already calculated RSS, the second one will
get the brand new memory chunk without the calculated RSS, so these
two packets will have different source ports after encapsulation.
The test is updated to cover this case.
Fix the issue by checking if the hash is valid before using, re-parsing
and calculating if it is not. The netdev_tnl_get_src_port() function
moved to the .c file, since there is no real reason for it to be in the
header. Compiler can decide on inlining it. The declaration kept in
the header, since all the other functions declared there, even if there
is no reason for that.
In the future we may want to consolidate all the places where we
re-calculate RSS hash into a single function, but it's a little tricky.
This is also a larger change that would be harder to backport. So, not
touching that aspect for now.
Re-parsing the packet eliminates advantages of the simple match, but
it was designed primarily for very simple setups that do not involve
tunneling or any other complex processing, so it should not be a big
problem. And simple match can still be used with tunneling when the
input port provides the RSS hash.
Also, checking if the hash is valid is a right thing to do anyways.
Next step might be to not use simple match when there is no RSS hash
and there is a tunnel push action, but it seems hard to implement,
especially since we don't know the actions until we lookup the flow.
Fixes: e7e9973b80d3 ("dpif-netdev: Forwarding optimization for flows with a simple match.")
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-11-29 17:36:45 +01:00
|
|
|
|
ovs_be16 udp_src;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
int ip_tot_size;
|
|
|
|
|
|
netdev-native-tnl: Fix use of uninitialized RSS hash.
RSS hash calculation for a packet may be skipped in some cases. One
of them is a simple match optimization. Packet is not fully parsed
for the simple match, so there is no enough data to calculate the full
5-tuple hash. However, when such a packet needs tunnel encapsulation,
we need RSS hash to calculate the source port for the outer UDP header.
And netdev_tnl_get_src_port() function doesn't check if the hash is
valid before using it. So, such packets will likely end up with
different and unpredictable source ports potentially causing packet
reordering or other issues in the network:
WARNING: MemorySanitizer: use-of-uninitialized-value
0 0x10c129c in dp_packet_get_rss_hash lib/dp-packet.h:1029:5
1 0x10b264c in netdev_tnl_get_src_port lib/netdev-native-tnl.h:131:12
2 0x10b171a in netdev_tnl_push_udp_header lib/netdev-native-tnl.c:286:20
3 0xb772fe in netdev_push_header lib/netdev.c:1037:13
4 0x9673c4 in push_tnl_action lib/dpif-netdev.c:9067:11
5 0x961abe in dp_execute_cb lib/dpif-netdev.c:9226:13
6 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
7 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
8 0x968f3f in dp_execute_userspace_action lib/dpif-netdev.c:9093:9
9 0x962e54 in dp_execute_cb lib/dpif-netdev.c:9307:17
10 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
11 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
12 0x950fef in packet_batch_per_flow_execute lib/dpif-netdev.c:8271:5
13 0x8ec8db in dp_netdev_input__ lib/dpif-netdev.c:8899:9
14 0x8eb8ec in dp_netdev_input lib/dpif-netdev.c:8908:5
15 0x92d5e8 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5660:19
16 0x8ee2c4 in dpif_netdev_run lib/dpif-netdev.c:6993:25
17 0x9b442f in dpif_run lib/dpif.c:471:16
18 0x5f8e3a in type_run ofproto/ofproto-dpif.c:367:9
19 0x56c508 in ofproto_type_run ofproto/ofproto.c:1879:31
20 0x4cb388 in bridge_run__ vswitchd/bridge.c:3281:9
21 0x4c9b00 in bridge_run vswitchd/bridge.c:3346:5
22 0x526043 in main vswitchd/ovs-vswitchd.c:130:9
23 0x7f1192 in __libc_start_call_main
24 0x7f1192 in __libc_start_main@GLIBC_2.2.5
25 0x432b24 in _start (vswitchd/ovs-vswitchd+0x432b24)
The issue is caught by running the 'debug_slow' test under the memory
sanitizer. Another way to reproduce is by sending two packets at once
through the datapath. The first one will get the same memory chunk as
the upcalled packet with already calculated RSS, the second one will
get the brand new memory chunk without the calculated RSS, so these
two packets will have different source ports after encapsulation.
The test is updated to cover this case.
Fix the issue by checking if the hash is valid before using, re-parsing
and calculating if it is not. The netdev_tnl_get_src_port() function
moved to the .c file, since there is no real reason for it to be in the
header. Compiler can decide on inlining it. The declaration kept in
the header, since all the other functions declared there, even if there
is no reason for that.
In the future we may want to consolidate all the places where we
re-calculate RSS hash into a single function, but it's a little tricky.
This is also a larger change that would be harder to backport. So, not
touching that aspect for now.
Re-parsing the packet eliminates advantages of the simple match, but
it was designed primarily for very simple setups that do not involve
tunneling or any other complex processing, so it should not be a big
problem. And simple match can still be used with tunneling when the
input port provides the RSS hash.
Also, checking if the hash is valid is a right thing to do anyways.
Next step might be to not use simple match when there is no RSS hash
and there is a tunnel push action, but it seems hard to implement,
especially since we don't know the actions until we lookup the flow.
Fixes: e7e9973b80d3 ("dpif-netdev: Forwarding optimization for flows with a simple match.")
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-11-29 17:36:45 +01:00
|
|
|
|
/* We may need to re-calculate the hash and this has to be done before
|
|
|
|
|
* modifying the packet. */
|
|
|
|
|
udp_src = netdev_tnl_get_src_port(packet);
|
|
|
|
|
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_push(packet, data);
|
2023-05-23 12:58:21 +09:00
|
|
|
|
udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
|
|
|
|
|
&ip_tot_size, 0);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
netdev-native-tnl: Fix use of uninitialized RSS hash.
RSS hash calculation for a packet may be skipped in some cases. One
of them is a simple match optimization. Packet is not fully parsed
for the simple match, so there is no enough data to calculate the full
5-tuple hash. However, when such a packet needs tunnel encapsulation,
we need RSS hash to calculate the source port for the outer UDP header.
And netdev_tnl_get_src_port() function doesn't check if the hash is
valid before using it. So, such packets will likely end up with
different and unpredictable source ports potentially causing packet
reordering or other issues in the network:
WARNING: MemorySanitizer: use-of-uninitialized-value
0 0x10c129c in dp_packet_get_rss_hash lib/dp-packet.h:1029:5
1 0x10b264c in netdev_tnl_get_src_port lib/netdev-native-tnl.h:131:12
2 0x10b171a in netdev_tnl_push_udp_header lib/netdev-native-tnl.c:286:20
3 0xb772fe in netdev_push_header lib/netdev.c:1037:13
4 0x9673c4 in push_tnl_action lib/dpif-netdev.c:9067:11
5 0x961abe in dp_execute_cb lib/dpif-netdev.c:9226:13
6 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
7 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
8 0x968f3f in dp_execute_userspace_action lib/dpif-netdev.c:9093:9
9 0x962e54 in dp_execute_cb lib/dpif-netdev.c:9307:17
10 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
11 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
12 0x950fef in packet_batch_per_flow_execute lib/dpif-netdev.c:8271:5
13 0x8ec8db in dp_netdev_input__ lib/dpif-netdev.c:8899:9
14 0x8eb8ec in dp_netdev_input lib/dpif-netdev.c:8908:5
15 0x92d5e8 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5660:19
16 0x8ee2c4 in dpif_netdev_run lib/dpif-netdev.c:6993:25
17 0x9b442f in dpif_run lib/dpif.c:471:16
18 0x5f8e3a in type_run ofproto/ofproto-dpif.c:367:9
19 0x56c508 in ofproto_type_run ofproto/ofproto.c:1879:31
20 0x4cb388 in bridge_run__ vswitchd/bridge.c:3281:9
21 0x4c9b00 in bridge_run vswitchd/bridge.c:3346:5
22 0x526043 in main vswitchd/ovs-vswitchd.c:130:9
23 0x7f1192 in __libc_start_call_main
24 0x7f1192 in __libc_start_main@GLIBC_2.2.5
25 0x432b24 in _start (vswitchd/ovs-vswitchd+0x432b24)
The issue is caught by running the 'debug_slow' test under the memory
sanitizer. Another way to reproduce is by sending two packets at once
through the datapath. The first one will get the same memory chunk as
the upcalled packet with already calculated RSS, the second one will
get the brand new memory chunk without the calculated RSS, so these
two packets will have different source ports after encapsulation.
The test is updated to cover this case.
Fix the issue by checking if the hash is valid before using, re-parsing
and calculating if it is not. The netdev_tnl_get_src_port() function
moved to the .c file, since there is no real reason for it to be in the
header. Compiler can decide on inlining it. The declaration kept in
the header, since all the other functions declared there, even if there
is no reason for that.
In the future we may want to consolidate all the places where we
re-calculate RSS hash into a single function, but it's a little tricky.
This is also a larger change that would be harder to backport. So, not
touching that aspect for now.
Re-parsing the packet eliminates advantages of the simple match, but
it was designed primarily for very simple setups that do not involve
tunneling or any other complex processing, so it should not be a big
problem. And simple match can still be used with tunneling when the
input port provides the RSS hash.
Also, checking if the hash is valid is a right thing to do anyways.
Next step might be to not use simple match when there is no RSS hash
and there is a tunnel push action, but it seems hard to implement,
especially since we don't know the actions until we lookup the flow.
Fixes: e7e9973b80d3 ("dpif-netdev: Forwarding optimization for flows with a simple match.")
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-11-29 17:36:45 +01:00
|
|
|
|
udp->udp_src = udp_src;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
udp->udp_len = htons(ip_tot_size);
|
|
|
|
|
|
2025-06-17 09:20:58 +02:00
|
|
|
|
dp_packet_l4_proto_set_udp(packet);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
if (udp->udp_csum) {
|
2025-06-17 09:20:58 +02:00
|
|
|
|
dp_packet_l4_checksum_set_partial(packet);
|
2025-06-17 09:20:52 +02:00
|
|
|
|
} else {
|
2025-06-17 09:20:58 +02:00
|
|
|
|
dp_packet_l4_checksum_set_good(packet);
|
2024-01-17 14:26:31 -05:00
|
|
|
|
}
|
2024-01-17 14:26:30 -05:00
|
|
|
|
|
2024-01-17 14:26:31 -05:00
|
|
|
|
if (l3_ofs != UINT16_MAX) {
|
|
|
|
|
packet->inner_l3_ofs = l3_ofs + data->header_len;
|
|
|
|
|
}
|
|
|
|
|
if (l4_ofs != UINT16_MAX) {
|
|
|
|
|
packet->inner_l4_ofs = l4_ofs + data->header_len;
|
|
|
|
|
}
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void *
|
2016-05-23 20:27:14 -07:00
|
|
|
|
eth_build_header(struct ovs_action_push_tnl *data,
|
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
2016-05-17 17:31:33 -07:00
|
|
|
|
{
|
2016-05-23 20:27:14 -07:00
|
|
|
|
uint16_t eth_proto = params->is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP;
|
|
|
|
|
struct eth_header *eth;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
memset(data->header, 0, sizeof data->header);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
eth = (struct eth_header *)data->header;
|
|
|
|
|
eth->eth_dst = params->dmac;
|
|
|
|
|
eth->eth_src = params->smac;
|
|
|
|
|
eth->eth_type = htons(eth_proto);
|
|
|
|
|
data->header_len = sizeof(struct eth_header);
|
|
|
|
|
return eth + 1;
|
|
|
|
|
}
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
void *
|
|
|
|
|
netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data,
|
|
|
|
|
const struct netdev_tnl_build_header_params *params,
|
2023-05-23 12:58:22 +09:00
|
|
|
|
uint8_t next_proto, ovs_be32 ipv6_label)
|
2016-05-23 20:27:14 -07:00
|
|
|
|
{
|
|
|
|
|
void *l3;
|
|
|
|
|
|
|
|
|
|
l3 = eth_build_header(data, params);
|
|
|
|
|
if (!params->is_ipv6) {
|
|
|
|
|
ovs_be32 ip_src = in6_addr_get_mapped_ipv4(params->s_ip);
|
|
|
|
|
struct ip_header *ip;
|
|
|
|
|
|
|
|
|
|
ip = (struct ip_header *) l3;
|
|
|
|
|
|
|
|
|
|
ip->ip_ihl_ver = IP_IHL_VER(5, 4);
|
|
|
|
|
ip->ip_tos = params->flow->tunnel.ip_tos;
|
|
|
|
|
ip->ip_ttl = params->flow->tunnel.ip_ttl;
|
|
|
|
|
ip->ip_proto = next_proto;
|
|
|
|
|
put_16aligned_be32(&ip->ip_src, ip_src);
|
|
|
|
|
put_16aligned_be32(&ip->ip_dst, params->flow->tunnel.ip_dst);
|
|
|
|
|
|
|
|
|
|
ip->ip_frag_off = (params->flow->tunnel.flags & FLOW_TNL_F_DONT_FRAGMENT) ?
|
|
|
|
|
htons(IP_DF) : 0;
|
|
|
|
|
|
2023-06-14 15:03:26 -04:00
|
|
|
|
/* The checksum will be calculated when the headers are pushed
|
|
|
|
|
* to the packet if offloading is not enabled. */
|
2016-05-23 20:27:14 -07:00
|
|
|
|
|
|
|
|
|
data->header_len += IP_HEADER_LEN;
|
|
|
|
|
return ip + 1;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
} else {
|
2016-05-23 20:27:14 -07:00
|
|
|
|
struct ovs_16aligned_ip6_hdr *ip6;
|
|
|
|
|
|
|
|
|
|
ip6 = (struct ovs_16aligned_ip6_hdr *) l3;
|
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
put_16aligned_be32(&ip6->ip6_flow, htonl(6 << 28) |
|
2023-05-23 12:58:22 +09:00
|
|
|
|
htonl(params->flow->tunnel.ip_tos << 20) |
|
|
|
|
|
(ipv6_label & htonl(IPV6_LABEL_MASK)));
|
2016-05-23 20:27:14 -07:00
|
|
|
|
ip6->ip6_hlim = params->flow->tunnel.ip_ttl;
|
|
|
|
|
ip6->ip6_nxt = next_proto;
|
|
|
|
|
memcpy(&ip6->ip6_src, params->s_ip, sizeof(ovs_be32[4]));
|
|
|
|
|
memcpy(&ip6->ip6_dst, ¶ms->flow->tunnel.ipv6_dst, sizeof(ovs_be32[4]));
|
|
|
|
|
|
|
|
|
|
data->header_len += IPV6_HEADER_LEN;
|
|
|
|
|
return ip6 + 1;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
2016-05-23 20:27:14 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void *
|
2023-05-20 01:35:26 +02:00
|
|
|
|
udp_build_header(const struct netdev_tunnel_config *tnl_cfg,
|
2016-05-23 20:27:14 -07:00
|
|
|
|
struct ovs_action_push_tnl *data,
|
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
|
|
|
|
{
|
|
|
|
|
struct udp_header *udp;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2023-05-23 12:58:22 +09:00
|
|
|
|
udp = netdev_tnl_ip_build_header(data, params, IPPROTO_UDP, 0);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
udp->udp_dst = tnl_cfg->dst_port;
|
|
|
|
|
|
2024-07-05 16:45:01 -04:00
|
|
|
|
if (params->flow->tunnel.flags & FLOW_TNL_F_CSUM) {
|
2016-05-17 17:31:33 -07:00
|
|
|
|
/* Write a value in now to mark that we should compute the checksum
|
|
|
|
|
* later. 0xffff is handy because it is transparent to the
|
|
|
|
|
* calculation. */
|
|
|
|
|
udp->udp_csum = htons(0xffff);
|
|
|
|
|
}
|
2016-05-23 20:27:14 -07:00
|
|
|
|
data->header_len += sizeof *udp;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
return udp + 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
gre_header_len(ovs_be16 flags)
|
|
|
|
|
{
|
|
|
|
|
int hlen = 4;
|
|
|
|
|
|
|
|
|
|
if (flags & htons(GRE_CSUM)) {
|
|
|
|
|
hlen += 4;
|
|
|
|
|
}
|
|
|
|
|
if (flags & htons(GRE_KEY)) {
|
|
|
|
|
hlen += 4;
|
|
|
|
|
}
|
|
|
|
|
if (flags & htons(GRE_SEQ)) {
|
|
|
|
|
hlen += 4;
|
|
|
|
|
}
|
|
|
|
|
return hlen;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
parse_gre_header(struct dp_packet *packet,
|
|
|
|
|
struct flow_tnl *tnl)
|
|
|
|
|
{
|
|
|
|
|
const struct gre_base_hdr *greh;
|
|
|
|
|
ovs_16aligned_be32 *options;
|
|
|
|
|
int hlen;
|
|
|
|
|
unsigned int ulen;
|
2017-06-02 16:16:21 +00:00
|
|
|
|
uint16_t greh_protocol;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2025-03-13 13:43:35 +01:00
|
|
|
|
greh = ip_extract_tnl_md(packet, tnl, &ulen);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
if (!greh) {
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (greh->flags & ~(htons(GRE_CSUM | GRE_KEY | GRE_SEQ))) {
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hlen = ulen + gre_header_len(greh->flags);
|
|
|
|
|
if (hlen > dp_packet_size(packet)) {
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
options = (ovs_16aligned_be32 *)(greh + 1);
|
|
|
|
|
if (greh->flags & htons(GRE_CSUM)) {
|
|
|
|
|
ovs_be16 pkt_csum;
|
|
|
|
|
|
|
|
|
|
pkt_csum = csum(greh, dp_packet_size(packet) -
|
|
|
|
|
((const unsigned char *)greh -
|
2017-04-25 16:29:59 +00:00
|
|
|
|
(const unsigned char *)dp_packet_eth(packet)));
|
2016-05-17 17:31:33 -07:00
|
|
|
|
if (pkt_csum) {
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
tnl->flags = FLOW_TNL_F_CSUM;
|
|
|
|
|
options++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (greh->flags & htons(GRE_KEY)) {
|
2016-05-26 16:53:52 -07:00
|
|
|
|
tnl->tun_id = be32_to_be64(get_16aligned_be32(options));
|
2016-05-17 17:31:33 -07:00
|
|
|
|
tnl->flags |= FLOW_TNL_F_KEY;
|
|
|
|
|
options++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (greh->flags & htons(GRE_SEQ)) {
|
|
|
|
|
options++;
|
|
|
|
|
}
|
|
|
|
|
|
2017-06-02 16:16:21 +00:00
|
|
|
|
/* Set the new packet type depending on the GRE protocol field. */
|
|
|
|
|
greh_protocol = ntohs(greh->protocol);
|
|
|
|
|
if (greh_protocol == ETH_TYPE_TEB) {
|
|
|
|
|
packet->packet_type = htonl(PT_ETH);
|
|
|
|
|
} else if (greh_protocol >= ETH_TYPE_MIN) {
|
|
|
|
|
/* Allow all GRE protocol values above 0x5ff as Ethertypes. */
|
|
|
|
|
packet->packet_type = PACKET_TYPE_BE(OFPHTN_ETHERTYPE, greh_protocol);
|
|
|
|
|
} else {
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
return hlen;
|
|
|
|
|
}
|
|
|
|
|
|
2016-05-17 17:32:06 -07:00
|
|
|
|
struct dp_packet *
|
2016-05-17 17:31:33 -07:00
|
|
|
|
netdev_gre_pop_header(struct dp_packet *packet)
|
|
|
|
|
{
|
2023-09-03 23:21:54 +08:00
|
|
|
|
const void *data_dp = dp_packet_data(packet);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
struct pkt_metadata *md = &packet->md;
|
|
|
|
|
struct flow_tnl *tnl = &md->tunnel;
|
|
|
|
|
int hlen = sizeof(struct eth_header) + 4;
|
|
|
|
|
|
2023-09-03 23:21:54 +08:00
|
|
|
|
ovs_assert(data_dp);
|
|
|
|
|
|
|
|
|
|
hlen += netdev_tnl_is_header_ipv6(data_dp) ?
|
2016-05-17 17:31:33 -07:00
|
|
|
|
IPV6_HEADER_LEN : IP_HEADER_LEN;
|
|
|
|
|
|
|
|
|
|
pkt_metadata_init_tnl(md);
|
|
|
|
|
if (hlen > dp_packet_size(packet)) {
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hlen = parse_gre_header(packet, tnl);
|
|
|
|
|
if (hlen < 0) {
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_pop(packet, hlen);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-17 17:32:06 -07:00
|
|
|
|
return packet;
|
|
|
|
|
err:
|
|
|
|
|
dp_packet_delete(packet);
|
|
|
|
|
return NULL;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2018-05-15 16:10:49 -04:00
|
|
|
|
netdev_gre_push_header(const struct netdev *netdev,
|
2018-03-09 13:02:23 -08:00
|
|
|
|
struct dp_packet *packet,
|
2016-05-17 17:31:33 -07:00
|
|
|
|
const struct ovs_action_push_tnl *data)
|
|
|
|
|
{
|
2018-05-15 16:10:49 -04:00
|
|
|
|
struct netdev_vport *dev = netdev_vport_cast(netdev);
|
2025-01-16 00:21:31 -05:00
|
|
|
|
uint16_t l3_ofs = packet->l3_ofs;
|
|
|
|
|
uint16_t l4_ofs = packet->l4_ofs;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
struct gre_base_hdr *greh;
|
|
|
|
|
int ip_tot_size;
|
|
|
|
|
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_push(packet, data);
|
2025-01-16 00:21:31 -05:00
|
|
|
|
|
2023-05-23 12:58:21 +09:00
|
|
|
|
greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
|
|
|
|
|
&ip_tot_size, 0);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
|
|
|
|
if (greh->flags & htons(GRE_CSUM)) {
|
|
|
|
|
ovs_be16 *csum_opt = (ovs_be16 *) (greh + 1);
|
|
|
|
|
*csum_opt = csum(greh, ip_tot_size);
|
|
|
|
|
}
|
2018-05-15 16:10:49 -04:00
|
|
|
|
|
|
|
|
|
if (greh->flags & htons(GRE_SEQ)) {
|
2025-06-17 09:21:00 +02:00
|
|
|
|
if (!dp_packet_get_tso_segsz(packet)) {
|
2025-01-16 00:21:31 -05:00
|
|
|
|
/* Last 4 bytes are GRE seqno. */
|
|
|
|
|
int seq_ofs = gre_header_len(greh->flags) - 4;
|
|
|
|
|
ovs_16aligned_be32 *seq_opt =
|
|
|
|
|
ALIGNED_CAST(ovs_16aligned_be32 *, (char *) greh + seq_ofs);
|
|
|
|
|
|
|
|
|
|
put_16aligned_be32(seq_opt,
|
|
|
|
|
htonl(atomic_count_inc(&dev->gre_seqno)));
|
|
|
|
|
} else {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "Cannot use GRE Sequence numbers with TSO.");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (l3_ofs != UINT16_MAX) {
|
|
|
|
|
packet->inner_l3_ofs = l3_ofs + data->header_len;
|
|
|
|
|
}
|
|
|
|
|
if (l4_ofs != UINT16_MAX) {
|
|
|
|
|
packet->inner_l4_ofs = l4_ofs + data->header_len;
|
2018-05-15 16:10:49 -04:00
|
|
|
|
}
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
netdev_gre_build_header(const struct netdev *netdev,
|
|
|
|
|
struct ovs_action_push_tnl *data,
|
2016-05-23 20:27:14 -07:00
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
2016-05-17 17:31:33 -07:00
|
|
|
|
{
|
2023-05-20 01:35:26 +02:00
|
|
|
|
const struct netdev_tunnel_config *tnl_cfg;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
struct gre_base_hdr *greh;
|
|
|
|
|
ovs_16aligned_be32 *options;
|
2016-05-23 20:27:14 -07:00
|
|
|
|
unsigned int hlen;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2023-05-23 12:58:22 +09:00
|
|
|
|
greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE, 0);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2017-06-23 16:47:59 +00:00
|
|
|
|
if (params->flow->packet_type == htonl(PT_ETH)) {
|
2017-06-02 16:16:21 +00:00
|
|
|
|
greh->protocol = htons(ETH_TYPE_TEB);
|
2017-06-23 16:47:59 +00:00
|
|
|
|
} else if (pt_ns(params->flow->packet_type) == OFPHTN_ETHERTYPE) {
|
|
|
|
|
greh->protocol = pt_ns_type_be(params->flow->packet_type);
|
|
|
|
|
} else {
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return EINVAL;
|
2017-06-02 16:16:21 +00:00
|
|
|
|
}
|
2016-05-17 17:31:33 -07:00
|
|
|
|
greh->flags = 0;
|
|
|
|
|
|
|
|
|
|
options = (ovs_16aligned_be32 *) (greh + 1);
|
2016-05-23 20:27:14 -07:00
|
|
|
|
if (params->flow->tunnel.flags & FLOW_TNL_F_CSUM) {
|
2016-05-17 17:31:33 -07:00
|
|
|
|
greh->flags |= htons(GRE_CSUM);
|
|
|
|
|
put_16aligned_be32(options, 0);
|
|
|
|
|
options++;
|
|
|
|
|
}
|
|
|
|
|
|
2023-05-20 01:35:26 +02:00
|
|
|
|
tnl_cfg = netdev_get_tunnel_config(netdev);
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
if (tnl_cfg->out_key_present) {
|
|
|
|
|
greh->flags |= htons(GRE_KEY);
|
2016-05-26 16:53:52 -07:00
|
|
|
|
put_16aligned_be32(options, be64_to_be32(params->flow->tunnel.tun_id));
|
2016-05-17 17:31:33 -07:00
|
|
|
|
options++;
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-15 16:10:49 -04:00
|
|
|
|
if (tnl_cfg->set_seq) {
|
|
|
|
|
greh->flags |= htons(GRE_SEQ);
|
|
|
|
|
/* seqno is updated at push header */
|
|
|
|
|
options++;
|
|
|
|
|
}
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
hlen = (uint8_t *) options - (uint8_t *) greh;
|
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
data->header_len += hlen;
|
2018-05-04 10:14:44 -07:00
|
|
|
|
if (!params->is_ipv6) {
|
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_GRE;
|
|
|
|
|
} else {
|
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_IP6GRE;
|
|
|
|
|
}
|
2016-05-17 17:31:33 -07:00
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-15 16:10:48 -04:00
|
|
|
|
struct dp_packet *
|
|
|
|
|
netdev_erspan_pop_header(struct dp_packet *packet)
|
|
|
|
|
{
|
|
|
|
|
const struct gre_base_hdr *greh;
|
|
|
|
|
const struct erspan_base_hdr *ersh;
|
|
|
|
|
struct pkt_metadata *md = &packet->md;
|
|
|
|
|
struct flow_tnl *tnl = &md->tunnel;
|
|
|
|
|
int hlen = sizeof(struct eth_header);
|
|
|
|
|
unsigned int ulen;
|
|
|
|
|
uint16_t greh_protocol;
|
|
|
|
|
|
|
|
|
|
hlen += netdev_tnl_is_header_ipv6(dp_packet_data(packet)) ?
|
|
|
|
|
IPV6_HEADER_LEN : IP_HEADER_LEN;
|
|
|
|
|
|
|
|
|
|
pkt_metadata_init_tnl(md);
|
|
|
|
|
if (hlen > dp_packet_size(packet)) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-13 13:43:35 +01:00
|
|
|
|
greh = ip_extract_tnl_md(packet, tnl, &ulen);
|
2018-05-15 16:10:48 -04:00
|
|
|
|
if (!greh) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
greh_protocol = ntohs(greh->protocol);
|
|
|
|
|
if (greh_protocol != ETH_TYPE_ERSPAN1 &&
|
|
|
|
|
greh_protocol != ETH_TYPE_ERSPAN2) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (greh->flags & ~htons(GRE_SEQ)) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ersh = ERSPAN_HDR(greh);
|
2018-05-04 10:14:44 -07:00
|
|
|
|
tnl->tun_id = be16_to_be64(htons(get_sid(ersh)));
|
2018-05-15 16:10:48 -04:00
|
|
|
|
tnl->erspan_ver = ersh->ver;
|
|
|
|
|
|
|
|
|
|
if (ersh->ver == 1) {
|
|
|
|
|
ovs_16aligned_be32 *index = ALIGNED_CAST(ovs_16aligned_be32 *,
|
|
|
|
|
ersh + 1);
|
|
|
|
|
tnl->erspan_idx = ntohl(get_16aligned_be32(index));
|
|
|
|
|
tnl->flags |= FLOW_TNL_F_KEY;
|
|
|
|
|
hlen = ulen + ERSPAN_GREHDR_LEN + sizeof *ersh + ERSPAN_V1_MDSIZE;
|
|
|
|
|
} else if (ersh->ver == 2) {
|
|
|
|
|
struct erspan_md2 *md2 = ALIGNED_CAST(struct erspan_md2 *, ersh + 1);
|
|
|
|
|
tnl->erspan_dir = md2->dir;
|
|
|
|
|
tnl->erspan_hwid = get_hwid(md2);
|
|
|
|
|
tnl->flags |= FLOW_TNL_F_KEY;
|
|
|
|
|
hlen = ulen + ERSPAN_GREHDR_LEN + sizeof *ersh + ERSPAN_V2_MDSIZE;
|
|
|
|
|
} else {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "ERSPAN version error %d", ersh->ver);
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (hlen > dp_packet_size(packet)) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_pop(packet, hlen);
|
2018-05-15 16:10:48 -04:00
|
|
|
|
|
|
|
|
|
return packet;
|
|
|
|
|
err:
|
|
|
|
|
dp_packet_delete(packet);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
netdev_erspan_push_header(const struct netdev *netdev,
|
|
|
|
|
struct dp_packet *packet,
|
|
|
|
|
const struct ovs_action_push_tnl *data)
|
|
|
|
|
{
|
|
|
|
|
struct netdev_vport *dev = netdev_vport_cast(netdev);
|
|
|
|
|
struct erspan_base_hdr *ersh;
|
|
|
|
|
struct gre_base_hdr *greh;
|
|
|
|
|
struct erspan_md2 *md2;
|
|
|
|
|
int ip_tot_size;
|
|
|
|
|
|
2023-05-23 12:58:21 +09:00
|
|
|
|
greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
|
|
|
|
|
&ip_tot_size, 0);
|
2018-05-15 16:10:48 -04:00
|
|
|
|
|
|
|
|
|
/* update GRE seqno */
|
|
|
|
|
ovs_16aligned_be32 *seqno = (ovs_16aligned_be32 *) (greh + 1);
|
2023-05-19 22:05:37 +02:00
|
|
|
|
put_16aligned_be32(seqno, htonl(atomic_count_inc(&dev->gre_seqno)));
|
2018-05-15 16:10:48 -04:00
|
|
|
|
|
|
|
|
|
/* update v2 timestamp */
|
|
|
|
|
if (greh->protocol == htons(ETH_TYPE_ERSPAN2)) {
|
|
|
|
|
ersh = ERSPAN_HDR(greh);
|
|
|
|
|
md2 = ALIGNED_CAST(struct erspan_md2 *, ersh + 1);
|
|
|
|
|
put_16aligned_be32(&md2->timestamp, get_erspan_ts(ERSPAN_100US));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
netdev_erspan_build_header(const struct netdev *netdev,
|
2019-12-03 13:37:56 -08:00
|
|
|
|
struct ovs_action_push_tnl *data,
|
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
2018-05-15 16:10:48 -04:00
|
|
|
|
{
|
2023-05-20 01:35:26 +02:00
|
|
|
|
const struct netdev_tunnel_config *tnl_cfg;
|
2018-05-15 16:10:48 -04:00
|
|
|
|
struct gre_base_hdr *greh;
|
|
|
|
|
struct erspan_base_hdr *ersh;
|
|
|
|
|
unsigned int hlen;
|
|
|
|
|
uint32_t tun_id;
|
2018-05-17 17:46:41 -07:00
|
|
|
|
int erspan_ver;
|
2018-05-15 16:10:48 -04:00
|
|
|
|
uint16_t sid;
|
|
|
|
|
|
2023-05-23 12:58:22 +09:00
|
|
|
|
greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE, 0);
|
2018-05-15 16:10:48 -04:00
|
|
|
|
ersh = ERSPAN_HDR(greh);
|
|
|
|
|
|
|
|
|
|
tun_id = ntohl(be64_to_be32(params->flow->tunnel.tun_id));
|
|
|
|
|
/* ERSPAN only has 10-bit session ID */
|
|
|
|
|
if (tun_id & ~ERSPAN_SID_MASK) {
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return EINVAL;
|
2018-05-15 16:10:48 -04:00
|
|
|
|
} else {
|
|
|
|
|
sid = (uint16_t) tun_id;
|
|
|
|
|
}
|
|
|
|
|
|
2023-05-20 01:35:26 +02:00
|
|
|
|
tnl_cfg = netdev_get_tunnel_config(netdev);
|
|
|
|
|
|
2018-05-17 17:46:41 -07:00
|
|
|
|
if (tnl_cfg->erspan_ver_flow) {
|
|
|
|
|
erspan_ver = params->flow->tunnel.erspan_ver;
|
|
|
|
|
} else {
|
|
|
|
|
erspan_ver = tnl_cfg->erspan_ver;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (erspan_ver == 1) {
|
2018-05-15 16:10:48 -04:00
|
|
|
|
greh->protocol = htons(ETH_TYPE_ERSPAN1);
|
|
|
|
|
greh->flags = htons(GRE_SEQ);
|
|
|
|
|
ersh->ver = 1;
|
|
|
|
|
set_sid(ersh, sid);
|
|
|
|
|
|
2018-05-23 19:13:56 -07:00
|
|
|
|
uint32_t erspan_idx = (tnl_cfg->erspan_idx_flow
|
|
|
|
|
? params->flow->tunnel.erspan_idx
|
|
|
|
|
: tnl_cfg->erspan_idx);
|
2018-05-15 16:10:48 -04:00
|
|
|
|
put_16aligned_be32(ALIGNED_CAST(ovs_16aligned_be32 *, ersh + 1),
|
2018-05-23 19:13:56 -07:00
|
|
|
|
htonl(erspan_idx));
|
2018-05-17 17:46:41 -07:00
|
|
|
|
|
2018-05-15 16:10:48 -04:00
|
|
|
|
hlen = ERSPAN_GREHDR_LEN + sizeof *ersh + ERSPAN_V1_MDSIZE;
|
2018-05-17 17:46:41 -07:00
|
|
|
|
} else if (erspan_ver == 2) {
|
|
|
|
|
struct erspan_md2 *md2 = ALIGNED_CAST(struct erspan_md2 *, ersh + 1);
|
|
|
|
|
|
2018-05-15 16:10:48 -04:00
|
|
|
|
greh->protocol = htons(ETH_TYPE_ERSPAN2);
|
|
|
|
|
greh->flags = htons(GRE_SEQ);
|
|
|
|
|
ersh->ver = 2;
|
|
|
|
|
set_sid(ersh, sid);
|
|
|
|
|
|
|
|
|
|
md2->sgt = 0; /* security group tag */
|
|
|
|
|
md2->gra = 0;
|
|
|
|
|
put_16aligned_be32(&md2->timestamp, 0);
|
2018-05-17 17:46:41 -07:00
|
|
|
|
|
|
|
|
|
if (tnl_cfg->erspan_hwid_flow) {
|
|
|
|
|
set_hwid(md2, params->flow->tunnel.erspan_hwid);
|
|
|
|
|
} else {
|
|
|
|
|
set_hwid(md2, tnl_cfg->erspan_hwid);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (tnl_cfg->erspan_dir_flow) {
|
|
|
|
|
md2->dir = params->flow->tunnel.erspan_dir;
|
|
|
|
|
} else {
|
|
|
|
|
md2->dir = tnl_cfg->erspan_dir;
|
|
|
|
|
}
|
2018-05-15 16:10:48 -04:00
|
|
|
|
|
|
|
|
|
hlen = ERSPAN_GREHDR_LEN + sizeof *ersh + ERSPAN_V2_MDSIZE;
|
|
|
|
|
} else {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "ERSPAN version error %d", tnl_cfg->erspan_ver);
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return EINVAL;
|
2018-05-15 16:10:48 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
data->header_len += hlen;
|
|
|
|
|
|
|
|
|
|
if (params->is_ipv6) {
|
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_IP6ERSPAN;
|
|
|
|
|
} else {
|
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_ERSPAN;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-25 11:19:23 -08:00
|
|
|
|
struct dp_packet *
|
|
|
|
|
netdev_gtpu_pop_header(struct dp_packet *packet)
|
|
|
|
|
{
|
|
|
|
|
struct pkt_metadata *md = &packet->md;
|
|
|
|
|
struct flow_tnl *tnl = &md->tunnel;
|
|
|
|
|
struct gtpuhdr *gtph;
|
|
|
|
|
unsigned int gtpu_hlen;
|
|
|
|
|
unsigned int hlen;
|
|
|
|
|
|
|
|
|
|
ovs_assert(packet->l3_ofs > 0);
|
|
|
|
|
ovs_assert(packet->l4_ofs > 0);
|
|
|
|
|
|
|
|
|
|
pkt_metadata_init_tnl(md);
|
|
|
|
|
if (GTPU_HLEN > dp_packet_l4_size(packet)) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
gtph = udp_extract_tnl_md(packet, tnl, &hlen);
|
|
|
|
|
if (!gtph) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tnl->gtpu_flags = gtph->md.flags;
|
|
|
|
|
tnl->gtpu_msgtype = gtph->md.msgtype;
|
|
|
|
|
tnl->tun_id = be32_to_be64(get_16aligned_be32(>ph->teid));
|
|
|
|
|
|
|
|
|
|
if (tnl->gtpu_msgtype == GTPU_MSGTYPE_GPDU) {
|
|
|
|
|
struct ip_header *ip;
|
|
|
|
|
|
|
|
|
|
if (gtph->md.flags & GTPU_S_MASK) {
|
|
|
|
|
gtpu_hlen = GTPU_HLEN + sizeof(struct gtpuhdr_opt);
|
|
|
|
|
} else {
|
|
|
|
|
gtpu_hlen = GTPU_HLEN;
|
|
|
|
|
}
|
|
|
|
|
ip = ALIGNED_CAST(struct ip_header *, (char *)gtph + gtpu_hlen);
|
|
|
|
|
|
|
|
|
|
if (IP_VER(ip->ip_ihl_ver) == 4) {
|
|
|
|
|
packet->packet_type = htonl(PT_IPV4);
|
|
|
|
|
} else if (IP_VER(ip->ip_ihl_ver) == 6) {
|
|
|
|
|
packet->packet_type = htonl(PT_IPV6);
|
|
|
|
|
} else {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "GTP-U: Receive non-IP packet.");
|
|
|
|
|
}
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_pop(packet, hlen + gtpu_hlen);
|
2019-11-25 11:19:23 -08:00
|
|
|
|
} else {
|
|
|
|
|
/* non-GPDU GTP-U messages, ex: echo request, end marker.
|
|
|
|
|
* Users should redirect these packets to controller, or.
|
|
|
|
|
* any application that handles GTP-U messages, so keep
|
|
|
|
|
* the original packet.
|
|
|
|
|
*/
|
|
|
|
|
packet->packet_type = htonl(PT_ETH);
|
|
|
|
|
VLOG_WARN_ONCE("Receive non-GPDU msgtype: %"PRIu8,
|
|
|
|
|
gtph->md.msgtype);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return packet;
|
|
|
|
|
|
|
|
|
|
err:
|
|
|
|
|
dp_packet_delete(packet);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
netdev_gtpu_push_header(const struct netdev *netdev,
|
|
|
|
|
struct dp_packet *packet,
|
|
|
|
|
const struct ovs_action_push_tnl *data)
|
|
|
|
|
{
|
|
|
|
|
struct netdev_vport *dev = netdev_vport_cast(netdev);
|
|
|
|
|
struct udp_header *udp;
|
|
|
|
|
struct gtpuhdr *gtpuh;
|
netdev-native-tnl: Fix use of uninitialized RSS hash.
RSS hash calculation for a packet may be skipped in some cases. One
of them is a simple match optimization. Packet is not fully parsed
for the simple match, so there is no enough data to calculate the full
5-tuple hash. However, when such a packet needs tunnel encapsulation,
we need RSS hash to calculate the source port for the outer UDP header.
And netdev_tnl_get_src_port() function doesn't check if the hash is
valid before using it. So, such packets will likely end up with
different and unpredictable source ports potentially causing packet
reordering or other issues in the network:
WARNING: MemorySanitizer: use-of-uninitialized-value
0 0x10c129c in dp_packet_get_rss_hash lib/dp-packet.h:1029:5
1 0x10b264c in netdev_tnl_get_src_port lib/netdev-native-tnl.h:131:12
2 0x10b171a in netdev_tnl_push_udp_header lib/netdev-native-tnl.c:286:20
3 0xb772fe in netdev_push_header lib/netdev.c:1037:13
4 0x9673c4 in push_tnl_action lib/dpif-netdev.c:9067:11
5 0x961abe in dp_execute_cb lib/dpif-netdev.c:9226:13
6 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
7 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
8 0x968f3f in dp_execute_userspace_action lib/dpif-netdev.c:9093:9
9 0x962e54 in dp_execute_cb lib/dpif-netdev.c:9307:17
10 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
11 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
12 0x950fef in packet_batch_per_flow_execute lib/dpif-netdev.c:8271:5
13 0x8ec8db in dp_netdev_input__ lib/dpif-netdev.c:8899:9
14 0x8eb8ec in dp_netdev_input lib/dpif-netdev.c:8908:5
15 0x92d5e8 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5660:19
16 0x8ee2c4 in dpif_netdev_run lib/dpif-netdev.c:6993:25
17 0x9b442f in dpif_run lib/dpif.c:471:16
18 0x5f8e3a in type_run ofproto/ofproto-dpif.c:367:9
19 0x56c508 in ofproto_type_run ofproto/ofproto.c:1879:31
20 0x4cb388 in bridge_run__ vswitchd/bridge.c:3281:9
21 0x4c9b00 in bridge_run vswitchd/bridge.c:3346:5
22 0x526043 in main vswitchd/ovs-vswitchd.c:130:9
23 0x7f1192 in __libc_start_call_main
24 0x7f1192 in __libc_start_main@GLIBC_2.2.5
25 0x432b24 in _start (vswitchd/ovs-vswitchd+0x432b24)
The issue is caught by running the 'debug_slow' test under the memory
sanitizer. Another way to reproduce is by sending two packets at once
through the datapath. The first one will get the same memory chunk as
the upcalled packet with already calculated RSS, the second one will
get the brand new memory chunk without the calculated RSS, so these
two packets will have different source ports after encapsulation.
The test is updated to cover this case.
Fix the issue by checking if the hash is valid before using, re-parsing
and calculating if it is not. The netdev_tnl_get_src_port() function
moved to the .c file, since there is no real reason for it to be in the
header. Compiler can decide on inlining it. The declaration kept in
the header, since all the other functions declared there, even if there
is no reason for that.
In the future we may want to consolidate all the places where we
re-calculate RSS hash into a single function, but it's a little tricky.
This is also a larger change that would be harder to backport. So, not
touching that aspect for now.
Re-parsing the packet eliminates advantages of the simple match, but
it was designed primarily for very simple setups that do not involve
tunneling or any other complex processing, so it should not be a big
problem. And simple match can still be used with tunneling when the
input port provides the RSS hash.
Also, checking if the hash is valid is a right thing to do anyways.
Next step might be to not use simple match when there is no RSS hash
and there is a tunnel push action, but it seems hard to implement,
especially since we don't know the actions until we lookup the flow.
Fixes: e7e9973b80d3 ("dpif-netdev: Forwarding optimization for flows with a simple match.")
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-11-29 17:36:45 +01:00
|
|
|
|
ovs_be16 udp_src;
|
2019-11-25 11:19:23 -08:00
|
|
|
|
int ip_tot_size;
|
|
|
|
|
unsigned int payload_len;
|
|
|
|
|
|
netdev-native-tnl: Fix use of uninitialized RSS hash.
RSS hash calculation for a packet may be skipped in some cases. One
of them is a simple match optimization. Packet is not fully parsed
for the simple match, so there is no enough data to calculate the full
5-tuple hash. However, when such a packet needs tunnel encapsulation,
we need RSS hash to calculate the source port for the outer UDP header.
And netdev_tnl_get_src_port() function doesn't check if the hash is
valid before using it. So, such packets will likely end up with
different and unpredictable source ports potentially causing packet
reordering or other issues in the network:
WARNING: MemorySanitizer: use-of-uninitialized-value
0 0x10c129c in dp_packet_get_rss_hash lib/dp-packet.h:1029:5
1 0x10b264c in netdev_tnl_get_src_port lib/netdev-native-tnl.h:131:12
2 0x10b171a in netdev_tnl_push_udp_header lib/netdev-native-tnl.c:286:20
3 0xb772fe in netdev_push_header lib/netdev.c:1037:13
4 0x9673c4 in push_tnl_action lib/dpif-netdev.c:9067:11
5 0x961abe in dp_execute_cb lib/dpif-netdev.c:9226:13
6 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
7 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
8 0x968f3f in dp_execute_userspace_action lib/dpif-netdev.c:9093:9
9 0x962e54 in dp_execute_cb lib/dpif-netdev.c:9307:17
10 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
11 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
12 0x950fef in packet_batch_per_flow_execute lib/dpif-netdev.c:8271:5
13 0x8ec8db in dp_netdev_input__ lib/dpif-netdev.c:8899:9
14 0x8eb8ec in dp_netdev_input lib/dpif-netdev.c:8908:5
15 0x92d5e8 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5660:19
16 0x8ee2c4 in dpif_netdev_run lib/dpif-netdev.c:6993:25
17 0x9b442f in dpif_run lib/dpif.c:471:16
18 0x5f8e3a in type_run ofproto/ofproto-dpif.c:367:9
19 0x56c508 in ofproto_type_run ofproto/ofproto.c:1879:31
20 0x4cb388 in bridge_run__ vswitchd/bridge.c:3281:9
21 0x4c9b00 in bridge_run vswitchd/bridge.c:3346:5
22 0x526043 in main vswitchd/ovs-vswitchd.c:130:9
23 0x7f1192 in __libc_start_call_main
24 0x7f1192 in __libc_start_main@GLIBC_2.2.5
25 0x432b24 in _start (vswitchd/ovs-vswitchd+0x432b24)
The issue is caught by running the 'debug_slow' test under the memory
sanitizer. Another way to reproduce is by sending two packets at once
through the datapath. The first one will get the same memory chunk as
the upcalled packet with already calculated RSS, the second one will
get the brand new memory chunk without the calculated RSS, so these
two packets will have different source ports after encapsulation.
The test is updated to cover this case.
Fix the issue by checking if the hash is valid before using, re-parsing
and calculating if it is not. The netdev_tnl_get_src_port() function
moved to the .c file, since there is no real reason for it to be in the
header. Compiler can decide on inlining it. The declaration kept in
the header, since all the other functions declared there, even if there
is no reason for that.
In the future we may want to consolidate all the places where we
re-calculate RSS hash into a single function, but it's a little tricky.
This is also a larger change that would be harder to backport. So, not
touching that aspect for now.
Re-parsing the packet eliminates advantages of the simple match, but
it was designed primarily for very simple setups that do not involve
tunneling or any other complex processing, so it should not be a big
problem. And simple match can still be used with tunneling when the
input port provides the RSS hash.
Also, checking if the hash is valid is a right thing to do anyways.
Next step might be to not use simple match when there is no RSS hash
and there is a tunnel push action, but it seems hard to implement,
especially since we don't know the actions until we lookup the flow.
Fixes: e7e9973b80d3 ("dpif-netdev: Forwarding optimization for flows with a simple match.")
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-11-29 17:36:45 +01:00
|
|
|
|
/* We may need to re-calculate the hash and this has to be done before
|
|
|
|
|
* modifying the packet. */
|
|
|
|
|
udp_src = netdev_tnl_get_src_port(packet);
|
|
|
|
|
|
2019-11-25 11:19:23 -08:00
|
|
|
|
payload_len = dp_packet_size(packet);
|
2023-05-23 12:58:21 +09:00
|
|
|
|
udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
|
|
|
|
|
&ip_tot_size, 0);
|
netdev-native-tnl: Fix use of uninitialized RSS hash.
RSS hash calculation for a packet may be skipped in some cases. One
of them is a simple match optimization. Packet is not fully parsed
for the simple match, so there is no enough data to calculate the full
5-tuple hash. However, when such a packet needs tunnel encapsulation,
we need RSS hash to calculate the source port for the outer UDP header.
And netdev_tnl_get_src_port() function doesn't check if the hash is
valid before using it. So, such packets will likely end up with
different and unpredictable source ports potentially causing packet
reordering or other issues in the network:
WARNING: MemorySanitizer: use-of-uninitialized-value
0 0x10c129c in dp_packet_get_rss_hash lib/dp-packet.h:1029:5
1 0x10b264c in netdev_tnl_get_src_port lib/netdev-native-tnl.h:131:12
2 0x10b171a in netdev_tnl_push_udp_header lib/netdev-native-tnl.c:286:20
3 0xb772fe in netdev_push_header lib/netdev.c:1037:13
4 0x9673c4 in push_tnl_action lib/dpif-netdev.c:9067:11
5 0x961abe in dp_execute_cb lib/dpif-netdev.c:9226:13
6 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
7 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
8 0x968f3f in dp_execute_userspace_action lib/dpif-netdev.c:9093:9
9 0x962e54 in dp_execute_cb lib/dpif-netdev.c:9307:17
10 0xbcb4b1 in odp_execute_actions lib/odp-execute.c:1008:17
11 0x8e939f in dp_netdev_execute_actions lib/dpif-netdev.c:9524:5
12 0x950fef in packet_batch_per_flow_execute lib/dpif-netdev.c:8271:5
13 0x8ec8db in dp_netdev_input__ lib/dpif-netdev.c:8899:9
14 0x8eb8ec in dp_netdev_input lib/dpif-netdev.c:8908:5
15 0x92d5e8 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5660:19
16 0x8ee2c4 in dpif_netdev_run lib/dpif-netdev.c:6993:25
17 0x9b442f in dpif_run lib/dpif.c:471:16
18 0x5f8e3a in type_run ofproto/ofproto-dpif.c:367:9
19 0x56c508 in ofproto_type_run ofproto/ofproto.c:1879:31
20 0x4cb388 in bridge_run__ vswitchd/bridge.c:3281:9
21 0x4c9b00 in bridge_run vswitchd/bridge.c:3346:5
22 0x526043 in main vswitchd/ovs-vswitchd.c:130:9
23 0x7f1192 in __libc_start_call_main
24 0x7f1192 in __libc_start_main@GLIBC_2.2.5
25 0x432b24 in _start (vswitchd/ovs-vswitchd+0x432b24)
The issue is caught by running the 'debug_slow' test under the memory
sanitizer. Another way to reproduce is by sending two packets at once
through the datapath. The first one will get the same memory chunk as
the upcalled packet with already calculated RSS, the second one will
get the brand new memory chunk without the calculated RSS, so these
two packets will have different source ports after encapsulation.
The test is updated to cover this case.
Fix the issue by checking if the hash is valid before using, re-parsing
and calculating if it is not. The netdev_tnl_get_src_port() function
moved to the .c file, since there is no real reason for it to be in the
header. Compiler can decide on inlining it. The declaration kept in
the header, since all the other functions declared there, even if there
is no reason for that.
In the future we may want to consolidate all the places where we
re-calculate RSS hash into a single function, but it's a little tricky.
This is also a larger change that would be harder to backport. So, not
touching that aspect for now.
Re-parsing the packet eliminates advantages of the simple match, but
it was designed primarily for very simple setups that do not involve
tunneling or any other complex processing, so it should not be a big
problem. And simple match can still be used with tunneling when the
input port provides the RSS hash.
Also, checking if the hash is valid is a right thing to do anyways.
Next step might be to not use simple match when there is no RSS hash
and there is a tunnel push action, but it seems hard to implement,
especially since we don't know the actions until we lookup the flow.
Fixes: e7e9973b80d3 ("dpif-netdev: Forwarding optimization for flows with a simple match.")
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-11-29 17:36:45 +01:00
|
|
|
|
udp->udp_src = udp_src;
|
2019-11-25 11:19:23 -08:00
|
|
|
|
udp->udp_len = htons(ip_tot_size);
|
2023-06-14 15:03:27 -04:00
|
|
|
|
/* Postpone checksum to the egress netdev. */
|
2025-06-17 09:20:58 +02:00
|
|
|
|
dp_packet_l4_proto_set_udp(packet);
|
|
|
|
|
dp_packet_l4_checksum_set_partial(packet);
|
2019-11-25 11:19:23 -08:00
|
|
|
|
|
|
|
|
|
gtpuh = ALIGNED_CAST(struct gtpuhdr *, udp + 1);
|
|
|
|
|
|
2023-05-19 22:05:37 +02:00
|
|
|
|
if (gtpuh->md.flags & GTPU_S_MASK) {
|
2019-11-25 11:19:23 -08:00
|
|
|
|
ovs_be16 *seqno = ALIGNED_CAST(ovs_be16 *, gtpuh + 1);
|
2023-05-19 22:05:37 +02:00
|
|
|
|
*seqno = htons(atomic_count_inc(&dev->gre_seqno));
|
2019-11-25 11:19:23 -08:00
|
|
|
|
payload_len += sizeof(struct gtpuhdr_opt);
|
|
|
|
|
}
|
|
|
|
|
gtpuh->len = htons(payload_len);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
netdev_gtpu_build_header(const struct netdev *netdev,
|
|
|
|
|
struct ovs_action_push_tnl *data,
|
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
|
|
|
|
{
|
2023-05-20 01:35:26 +02:00
|
|
|
|
const struct netdev_tunnel_config *tnl_cfg;
|
2019-11-25 11:19:23 -08:00
|
|
|
|
struct gtpuhdr *gtph;
|
|
|
|
|
unsigned int gtpu_hlen;
|
|
|
|
|
|
2023-05-20 01:35:26 +02:00
|
|
|
|
tnl_cfg = netdev_get_tunnel_config(netdev);
|
|
|
|
|
|
2019-11-25 11:19:23 -08:00
|
|
|
|
gtph = udp_build_header(tnl_cfg, data, params);
|
|
|
|
|
|
|
|
|
|
/* Set to default if not set in flow. */
|
|
|
|
|
gtph->md.flags = params->flow->tunnel.gtpu_flags ?
|
|
|
|
|
params->flow->tunnel.gtpu_flags : GTPU_FLAGS_DEFAULT;
|
|
|
|
|
gtph->md.msgtype = params->flow->tunnel.gtpu_msgtype ?
|
|
|
|
|
params->flow->tunnel.gtpu_msgtype : GTPU_MSGTYPE_GPDU;
|
|
|
|
|
put_16aligned_be32(>ph->teid,
|
|
|
|
|
be64_to_be32(params->flow->tunnel.tun_id));
|
|
|
|
|
|
|
|
|
|
gtpu_hlen = sizeof *gtph;
|
|
|
|
|
if (tnl_cfg->set_seq) {
|
|
|
|
|
gtph->md.flags |= GTPU_S_MASK;
|
|
|
|
|
gtpu_hlen += sizeof(struct gtpuhdr_opt);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
data->header_len += gtpu_hlen;
|
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_GTPU;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-29 14:51:17 +09:00
|
|
|
|
int
|
|
|
|
|
netdev_srv6_build_header(const struct netdev *netdev,
|
|
|
|
|
struct ovs_action_push_tnl *data,
|
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
|
|
|
|
{
|
2023-05-20 01:35:26 +02:00
|
|
|
|
const struct netdev_tunnel_config *tnl_cfg;
|
2024-05-17 20:33:03 +02:00
|
|
|
|
union ovs_16aligned_in6_addr *s;
|
2023-03-29 14:51:17 +09:00
|
|
|
|
const struct in6_addr *segs;
|
|
|
|
|
struct srv6_base_hdr *srh;
|
|
|
|
|
ovs_be16 dl_type;
|
|
|
|
|
int nr_segs;
|
|
|
|
|
int i;
|
|
|
|
|
|
2023-05-20 01:35:26 +02:00
|
|
|
|
tnl_cfg = netdev_get_tunnel_config(netdev);
|
2023-03-29 14:51:17 +09:00
|
|
|
|
if (tnl_cfg->srv6_num_segs) {
|
|
|
|
|
nr_segs = tnl_cfg->srv6_num_segs;
|
|
|
|
|
segs = tnl_cfg->srv6_segs;
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* If explicit segment list setting is omitted, tunnel destination
|
|
|
|
|
* is considered to be the first segment list.
|
|
|
|
|
*/
|
|
|
|
|
nr_segs = 1;
|
|
|
|
|
segs = ¶ms->flow->tunnel.ipv6_dst;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!ipv6_addr_equals(&segs[0], ¶ms->flow->tunnel.ipv6_dst)) {
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return EINVAL;
|
2023-03-29 14:51:17 +09:00
|
|
|
|
}
|
|
|
|
|
|
2023-05-23 12:58:23 +09:00
|
|
|
|
/* Writes the netdev_srv6_flowlabel enum value to the ipv6
|
|
|
|
|
* flowlabel field. It must later be replaced by a valid value
|
|
|
|
|
* in the header push. */
|
|
|
|
|
srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING,
|
|
|
|
|
htonl(tnl_cfg->srv6_flowlabel));
|
|
|
|
|
|
2023-03-29 14:51:17 +09:00
|
|
|
|
srh->rt_hdr.segments_left = nr_segs - 1;
|
|
|
|
|
srh->rt_hdr.type = IPV6_SRCRT_TYPE_4;
|
|
|
|
|
srh->rt_hdr.hdrlen = 2 * nr_segs;
|
|
|
|
|
srh->last_entry = nr_segs - 1;
|
|
|
|
|
srh->flags = 0;
|
|
|
|
|
srh->tag = 0;
|
|
|
|
|
|
|
|
|
|
dl_type = params->flow->dl_type;
|
|
|
|
|
if (dl_type == htons(ETH_TYPE_IP)) {
|
|
|
|
|
srh->rt_hdr.nexthdr = IPPROTO_IPIP;
|
|
|
|
|
} else if (dl_type == htons(ETH_TYPE_IPV6)) {
|
|
|
|
|
srh->rt_hdr.nexthdr = IPPROTO_IPV6;
|
|
|
|
|
} else {
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return EOPNOTSUPP;
|
2023-03-29 14:51:17 +09:00
|
|
|
|
}
|
|
|
|
|
|
2024-05-17 20:33:03 +02:00
|
|
|
|
s = (union ovs_16aligned_in6_addr *) (srh + 1);
|
2023-03-29 14:51:17 +09:00
|
|
|
|
for (i = 0; i < nr_segs; i++) {
|
|
|
|
|
/* Segment list is written to the header in reverse order. */
|
|
|
|
|
memcpy(s, &segs[nr_segs - i - 1], sizeof *s);
|
|
|
|
|
s++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
data->header_len += sizeof *srh + 8 * srh->rt_hdr.hdrlen;
|
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_SRV6;
|
|
|
|
|
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return 0;
|
2023-03-29 14:51:17 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
netdev_srv6_push_header(const struct netdev *netdev OVS_UNUSED,
|
|
|
|
|
struct dp_packet *packet,
|
|
|
|
|
const struct ovs_action_push_tnl *data)
|
|
|
|
|
{
|
2023-05-23 12:58:23 +09:00
|
|
|
|
struct ovs_16aligned_ip6_hdr *inner_ip6, *outer_ip6;
|
|
|
|
|
enum netdev_srv6_flowlabel srv6_flowlabel;
|
|
|
|
|
ovs_be32 ipv6_label = 0;
|
2023-03-29 14:51:17 +09:00
|
|
|
|
int ip_tot_size;
|
2023-05-23 12:58:23 +09:00
|
|
|
|
uint32_t flow;
|
|
|
|
|
|
|
|
|
|
inner_ip6 = dp_packet_l3(packet);
|
|
|
|
|
outer_ip6 = netdev_tnl_ipv6_hdr((void *) data->header);
|
|
|
|
|
srv6_flowlabel = ntohl(get_16aligned_be32(&outer_ip6->ip6_flow)) &
|
|
|
|
|
IPV6_LABEL_MASK;
|
|
|
|
|
|
|
|
|
|
switch (srv6_flowlabel) {
|
|
|
|
|
case SRV6_FLOWLABEL_COPY:
|
|
|
|
|
flow = ntohl(get_16aligned_be32(&inner_ip6->ip6_flow));
|
|
|
|
|
ipv6_label = (flow >> 28) == 6 ? htonl(flow & IPV6_LABEL_MASK) : 0;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case SRV6_FLOWLABEL_ZERO:
|
|
|
|
|
ipv6_label = 0;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case SRV6_FLOWLABEL_COMPUTE:
|
|
|
|
|
ipv6_label = htonl(dp_packet_get_rss_hash(packet) & IPV6_LABEL_MASK);
|
|
|
|
|
break;
|
|
|
|
|
}
|
2023-03-29 14:51:17 +09:00
|
|
|
|
|
2023-05-23 12:58:23 +09:00
|
|
|
|
netdev_tnl_push_ip_header(packet, data->header,
|
|
|
|
|
data->header_len, &ip_tot_size, ipv6_label);
|
2023-03-29 14:51:17 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct dp_packet *
|
|
|
|
|
netdev_srv6_pop_header(struct dp_packet *packet)
|
|
|
|
|
{
|
|
|
|
|
const struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
|
|
|
|
|
struct pkt_metadata *md = &packet->md;
|
|
|
|
|
struct flow_tnl *tnl = &md->tunnel;
|
|
|
|
|
const struct ip6_rt_hdr *rt_hdr;
|
|
|
|
|
uint8_t nw_proto = nh->ip6_nxt;
|
|
|
|
|
const void *data = nh + 1;
|
|
|
|
|
uint8_t nw_frag = 0;
|
|
|
|
|
unsigned int hlen;
|
2024-08-28 15:28:39 +02:00
|
|
|
|
size_t size;
|
2023-03-29 14:51:17 +09:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Verifies that the routing header is present in the IPv6
|
|
|
|
|
* extension headers and that its type is SRv6.
|
|
|
|
|
*/
|
2024-08-28 15:28:39 +02:00
|
|
|
|
size = dp_packet_l3_size(packet);
|
|
|
|
|
if (size < IPV6_HEADER_LEN) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
size -= IPV6_HEADER_LEN;
|
|
|
|
|
|
2023-03-29 14:51:17 +09:00
|
|
|
|
if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag,
|
|
|
|
|
NULL, &rt_hdr)) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!rt_hdr || rt_hdr->type != IPV6_SRCRT_TYPE_4) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (rt_hdr->segments_left > 0) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "invalid srv6 segments_left=%d\n",
|
|
|
|
|
rt_hdr->segments_left);
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (rt_hdr->nexthdr == IPPROTO_IPIP) {
|
|
|
|
|
packet->packet_type = htonl(PT_IPV4);
|
|
|
|
|
} else if (rt_hdr->nexthdr == IPPROTO_IPV6) {
|
|
|
|
|
packet->packet_type = htonl(PT_IPV6);
|
|
|
|
|
} else {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pkt_metadata_init_tnl(md);
|
2025-03-13 13:43:35 +01:00
|
|
|
|
if (!ip_extract_tnl_md(packet, tnl, &hlen)) {
|
2024-05-27 15:08:42 -04:00
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_pop(packet, hlen);
|
2023-03-29 14:51:17 +09:00
|
|
|
|
|
|
|
|
|
return packet;
|
|
|
|
|
err:
|
|
|
|
|
dp_packet_delete(packet);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2016-05-17 17:32:06 -07:00
|
|
|
|
struct dp_packet *
|
2016-05-17 17:31:33 -07:00
|
|
|
|
netdev_vxlan_pop_header(struct dp_packet *packet)
|
|
|
|
|
{
|
|
|
|
|
struct pkt_metadata *md = &packet->md;
|
|
|
|
|
struct flow_tnl *tnl = &md->tunnel;
|
|
|
|
|
struct vxlanhdr *vxh;
|
|
|
|
|
unsigned int hlen;
|
2017-06-02 16:16:26 +00:00
|
|
|
|
ovs_be32 vx_flags;
|
|
|
|
|
enum packet_type next_pt = PT_ETH;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2018-01-12 17:43:13 +00:00
|
|
|
|
ovs_assert(packet->l3_ofs > 0);
|
|
|
|
|
ovs_assert(packet->l4_ofs > 0);
|
|
|
|
|
|
2016-05-17 17:31:33 -07:00
|
|
|
|
pkt_metadata_init_tnl(md);
|
|
|
|
|
if (VXLAN_HLEN > dp_packet_l4_size(packet)) {
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
vxh = udp_extract_tnl_md(packet, tnl, &hlen);
|
|
|
|
|
if (!vxh) {
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
2017-06-02 16:16:26 +00:00
|
|
|
|
vx_flags = get_16aligned_be32(&vxh->vx_flags);
|
|
|
|
|
if (vx_flags & htonl(VXLAN_HF_GPE)) {
|
|
|
|
|
vx_flags &= htonl(~VXLAN_GPE_USED_BITS);
|
|
|
|
|
/* Drop the OAM packets */
|
|
|
|
|
if (vxh->vx_gpe.flags & VXLAN_GPE_FLAGS_O) {
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
switch (vxh->vx_gpe.next_protocol) {
|
|
|
|
|
case VXLAN_GPE_NP_IPV4:
|
|
|
|
|
next_pt = PT_IPV4;
|
|
|
|
|
break;
|
|
|
|
|
case VXLAN_GPE_NP_IPV6:
|
|
|
|
|
next_pt = PT_IPV6;
|
|
|
|
|
break;
|
2017-08-05 13:41:10 +08:00
|
|
|
|
case VXLAN_GPE_NP_NSH:
|
|
|
|
|
next_pt = PT_NSH;
|
|
|
|
|
break;
|
2017-06-02 16:16:26 +00:00
|
|
|
|
case VXLAN_GPE_NP_ETHERNET:
|
|
|
|
|
next_pt = PT_ETH;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (vx_flags != htonl(VXLAN_FLAGS) ||
|
2016-05-17 17:31:33 -07:00
|
|
|
|
(get_16aligned_be32(&vxh->vx_vni) & htonl(0xff))) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "invalid vxlan flags=%#x vni=%#x\n",
|
2017-06-02 16:16:26 +00:00
|
|
|
|
ntohl(vx_flags),
|
2016-05-17 17:31:33 -07:00
|
|
|
|
ntohl(get_16aligned_be32(&vxh->vx_vni)));
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
tnl->tun_id = htonll(ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8);
|
|
|
|
|
tnl->flags |= FLOW_TNL_F_KEY;
|
|
|
|
|
|
2017-06-02 16:16:26 +00:00
|
|
|
|
packet->packet_type = htonl(next_pt);
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_pop(packet, hlen + VXLAN_HLEN);
|
2017-06-02 16:16:26 +00:00
|
|
|
|
if (next_pt != PT_ETH) {
|
|
|
|
|
packet->l3_ofs = 0;
|
|
|
|
|
}
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-17 17:32:06 -07:00
|
|
|
|
return packet;
|
|
|
|
|
err:
|
|
|
|
|
dp_packet_delete(packet);
|
|
|
|
|
return NULL;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
netdev_vxlan_build_header(const struct netdev *netdev,
|
|
|
|
|
struct ovs_action_push_tnl *data,
|
2016-05-23 20:27:14 -07:00
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
2016-05-17 17:31:33 -07:00
|
|
|
|
{
|
2023-05-20 01:35:26 +02:00
|
|
|
|
const struct netdev_tunnel_config *tnl_cfg;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
struct vxlanhdr *vxh;
|
|
|
|
|
|
2023-05-20 01:35:26 +02:00
|
|
|
|
tnl_cfg = netdev_get_tunnel_config(netdev);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
vxh = udp_build_header(tnl_cfg, data, params);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2017-06-02 16:16:26 +00:00
|
|
|
|
if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GPE)) {
|
|
|
|
|
put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS | VXLAN_HF_GPE));
|
|
|
|
|
put_16aligned_be32(&vxh->vx_vni,
|
|
|
|
|
htonl(ntohll(params->flow->tunnel.tun_id) << 8));
|
2017-06-23 16:47:59 +00:00
|
|
|
|
if (params->flow->packet_type == htonl(PT_ETH)) {
|
|
|
|
|
vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_ETHERNET;
|
|
|
|
|
} else if (pt_ns(params->flow->packet_type) == OFPHTN_ETHERTYPE) {
|
|
|
|
|
switch (pt_ns_type(params->flow->packet_type)) {
|
2017-06-02 16:16:26 +00:00
|
|
|
|
case ETH_TYPE_IP:
|
|
|
|
|
vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_IPV4;
|
|
|
|
|
break;
|
|
|
|
|
case ETH_TYPE_IPV6:
|
|
|
|
|
vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_IPV6;
|
|
|
|
|
break;
|
2017-08-05 13:41:10 +08:00
|
|
|
|
case ETH_TYPE_NSH:
|
|
|
|
|
vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_NSH;
|
|
|
|
|
break;
|
2017-06-02 16:16:26 +00:00
|
|
|
|
case ETH_TYPE_TEB:
|
|
|
|
|
vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_ETHERNET;
|
|
|
|
|
break;
|
2017-06-23 16:47:59 +00:00
|
|
|
|
default:
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return EINVAL;
|
2017-06-02 16:16:26 +00:00
|
|
|
|
}
|
|
|
|
|
} else {
|
2023-05-20 01:35:26 +02:00
|
|
|
|
return EINVAL;
|
2017-06-02 16:16:26 +00:00
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS));
|
|
|
|
|
put_16aligned_be32(&vxh->vx_vni,
|
|
|
|
|
htonl(ntohll(params->flow->tunnel.tun_id) << 8));
|
|
|
|
|
}
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
data->header_len += sizeof *vxh;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_VXLAN;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2016-05-17 17:32:06 -07:00
|
|
|
|
struct dp_packet *
|
2016-05-17 17:31:33 -07:00
|
|
|
|
netdev_geneve_pop_header(struct dp_packet *packet)
|
|
|
|
|
{
|
|
|
|
|
struct pkt_metadata *md = &packet->md;
|
|
|
|
|
struct flow_tnl *tnl = &md->tunnel;
|
|
|
|
|
struct genevehdr *gnh;
|
|
|
|
|
unsigned int hlen, opts_len, ulen;
|
|
|
|
|
|
|
|
|
|
pkt_metadata_init_tnl(md);
|
|
|
|
|
if (GENEVE_BASE_HLEN > dp_packet_l4_size(packet)) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "geneve packet too small: min header=%u packet size=%"PRIuSIZE"\n",
|
|
|
|
|
(unsigned int)GENEVE_BASE_HLEN, dp_packet_l4_size(packet));
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
gnh = udp_extract_tnl_md(packet, tnl, &ulen);
|
|
|
|
|
if (!gnh) {
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
opts_len = gnh->opt_len * 4;
|
|
|
|
|
hlen = ulen + GENEVE_BASE_HLEN + opts_len;
|
|
|
|
|
if (hlen > dp_packet_size(packet)) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "geneve packet too small: header len=%u packet size=%u\n",
|
|
|
|
|
hlen, dp_packet_size(packet));
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (gnh->ver != 0) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "unknown geneve version: %"PRIu8"\n", gnh->ver);
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (gnh->proto_type != htons(ETH_TYPE_TEB)) {
|
|
|
|
|
VLOG_WARN_RL(&err_rl, "unknown geneve encapsulated protocol: %#x\n",
|
|
|
|
|
ntohs(gnh->proto_type));
|
2016-05-17 17:32:06 -07:00
|
|
|
|
goto err;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tnl->flags |= gnh->oam ? FLOW_TNL_F_OAM : 0;
|
|
|
|
|
tnl->tun_id = htonll(ntohl(get_16aligned_be32(&gnh->vni)) >> 8);
|
|
|
|
|
tnl->flags |= FLOW_TNL_F_KEY;
|
|
|
|
|
|
|
|
|
|
memcpy(tnl->metadata.opts.gnv, gnh->options, opts_len);
|
|
|
|
|
tnl->metadata.present.len = opts_len;
|
|
|
|
|
tnl->flags |= FLOW_TNL_F_UDPIF;
|
|
|
|
|
|
2017-06-02 16:16:21 +00:00
|
|
|
|
packet->packet_type = htonl(PT_ETH);
|
dp-packet: Rework IP checksum offloads.
As the packet traverses through OVS, offloading Tx flags must be carefully
evaluated and updated which results in a bit of complexity because of a
separate "outer" Tx offloading flag coming from DPDK API,
and a "normal"/"inner" Tx offloading flag.
On the other hand, the DPDK mbuf API specifies 4 status when it comes to
IP checksums:
- RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum
- RTE_MBUF_F_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong
- RTE_MBUF_F_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid
- RTE_MBUF_F_RX_IP_CKSUM_NONE: the IP checksum is not correct in the
packet data, but the integrity of the IP header is verified.
This patch changes OVS API so that OVS code only tracks the status of
the checksum of the "current" L3 header and let the Tx flags aspect to
the netdev-* implementations.
With this API, the flow extraction can be cleaned up.
During packet processing, OVS can simply look for the IP checksum validity
(either good, or partial) before changing some IP header, and then mark
the checksum as partial.
In the conntrack case, when natting packets, the checksum status of the
inner part (ICMP error case) must be forced temporarily as unknown
to force checksum resolution.
When tunneling comes into play, IP checksums status is bit-shifted for
future considerations in the processing if, for example, the tunnel
header gets decapsulated again, or in the netdev-* implementations that
support tunnel offloading.
Finally, netdev-* implementations only need to care about packets in
partial status: a good checksum does not need touching, a bad checksum
has been updated by kept as bad by OVS, an unknown checksum is either
an IPv6 or if it was an IPv4, OVS updated it too (keeping it good or bad
accordingly).
Rename current API for consistency with dp_packet_(inner_)?ip_checksum_.
Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2025-06-17 09:20:57 +02:00
|
|
|
|
tnl_ol_pop(packet, hlen);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-17 17:32:06 -07:00
|
|
|
|
return packet;
|
|
|
|
|
err:
|
|
|
|
|
dp_packet_delete(packet);
|
|
|
|
|
return NULL;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
netdev_geneve_build_header(const struct netdev *netdev,
|
|
|
|
|
struct ovs_action_push_tnl *data,
|
2016-05-23 20:27:14 -07:00
|
|
|
|
const struct netdev_tnl_build_header_params *params)
|
2016-05-17 17:31:33 -07:00
|
|
|
|
{
|
|
|
|
|
struct genevehdr *gnh;
|
|
|
|
|
int opt_len;
|
|
|
|
|
bool crit_opt;
|
|
|
|
|
|
2023-05-20 01:35:26 +02:00
|
|
|
|
gnh = udp_build_header(netdev_get_tunnel_config(netdev), data, params);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
put_16aligned_be32(&gnh->vni, htonl(ntohll(params->flow->tunnel.tun_id) << 8));
|
2016-05-17 17:31:33 -07:00
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
opt_len = tun_metadata_to_geneve_header(¶ms->flow->tunnel,
|
2016-05-17 17:31:33 -07:00
|
|
|
|
gnh->options, &crit_opt);
|
|
|
|
|
|
|
|
|
|
gnh->opt_len = opt_len / 4;
|
2016-05-23 20:27:14 -07:00
|
|
|
|
gnh->oam = !!(params->flow->tunnel.flags & FLOW_TNL_F_OAM);
|
2016-05-17 17:31:33 -07:00
|
|
|
|
gnh->critical = crit_opt ? 1 : 0;
|
|
|
|
|
gnh->proto_type = htons(ETH_TYPE_TEB);
|
|
|
|
|
|
2016-05-23 20:27:14 -07:00
|
|
|
|
data->header_len += sizeof *gnh + opt_len;
|
2016-05-17 17:31:33 -07:00
|
|
|
|
data->tnl_type = OVS_VPORT_TYPE_GENEVE;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
netdev_tnl_egress_port_range(struct unixctl_conn *conn, int argc,
|
|
|
|
|
const char *argv[], void *aux OVS_UNUSED)
|
|
|
|
|
{
|
|
|
|
|
int val1, val2;
|
|
|
|
|
|
|
|
|
|
if (argc < 3) {
|
|
|
|
|
struct ds ds = DS_EMPTY_INITIALIZER;
|
|
|
|
|
|
|
|
|
|
ds_put_format(&ds, "Tunnel UDP source port range: %"PRIu16"-%"PRIu16"\n",
|
|
|
|
|
tnl_udp_port_min, tnl_udp_port_max);
|
|
|
|
|
|
|
|
|
|
unixctl_command_reply(conn, ds_cstr(&ds));
|
|
|
|
|
ds_destroy(&ds);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (argc != 3) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
val1 = atoi(argv[1]);
|
|
|
|
|
if (val1 <= 0 || val1 > UINT16_MAX) {
|
|
|
|
|
unixctl_command_reply(conn, "Invalid min.");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
val2 = atoi(argv[2]);
|
|
|
|
|
if (val2 <= 0 || val2 > UINT16_MAX) {
|
|
|
|
|
unixctl_command_reply(conn, "Invalid max.");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (val1 > val2) {
|
|
|
|
|
tnl_udp_port_min = val2;
|
|
|
|
|
tnl_udp_port_max = val1;
|
|
|
|
|
} else {
|
|
|
|
|
tnl_udp_port_min = val1;
|
|
|
|
|
tnl_udp_port_max = val2;
|
|
|
|
|
}
|
|
|
|
|
seq_change(tnl_conf_seq);
|
|
|
|
|
|
|
|
|
|
unixctl_command_reply(conn, "OK");
|
|
|
|
|
}
|