2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 01:51:26 +00:00

conntrack: Handle SNAT with all-zero IP address.

This patch introduces for the userspace datapath the handling
of rules like the following:

  ct(commit,nat(src=0.0.0.0),...)

The kernel datapath already handles this case, which is particularly
handy in scenarios like the following:

Given A: 10.1.1.1, B: 192.168.2.100, C: 10.1.1.2

A opens a connection toward B on port 80 selecting as source port 10000.
B's IP gets dnat'ed to C's IP (10.1.1.1:10000 -> 192.168.2.100:80).

This will result in:

  tcp,orig=(src=10.1.1.1,dst=192.168.2.100,sport=10000,dport=80),
     reply=(src=10.1.1.2,dst=10.1.1.1,sport=80,dport=10000),
     protoinfo=(state=ESTABLISHED)

A now tries to establish another connection with C using source port
10000, this time using C's IP address (10.1.1.1:10000 -> 10.1.1.2:80).

This second connection, if processed by conntrack with no SNAT/DNAT
involved, collides with the reverse tuple of the first connection,
so the entry for this valid connection doesn't get created.

With this commit, adding a SNAT rule with 0.0.0.0 for
10.1.1.1:10000 -> 10.1.1.2:80 allows the conn entry to be created:

  tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=10000,dport=80),
     reply=(src=10.1.1.2,dst=10.1.1.1,sport=80,dport=10001),
     protoinfo=(state=ESTABLISHED)
  tcp,orig=(src=10.1.1.1,dst=192.168.2.100,sport=10000,dport=80),
     reply=(src=10.1.1.2,dst=10.1.1.1,sport=80,dport=10000),
     protoinfo=(state=ESTABLISHED)

The issue exists even in the opposite case (with A trying to connect
to C using B's IP after establishing a direct connection from A to C).

This commit refactors the relevant function so that both of the
previously mentioned cases are handled.

Suggested-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Paolo Valerio <pvalerio@redhat.com>
Acked-by: Gaetan Rivet <grive@u256.net>
Acked-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
This commit is contained in:
Paolo Valerio 2021-07-06 15:03:18 +02:00 committed by Ilya Maximets
parent fa0e2d26df
commit 61e48c2d1d
5 changed files with 246 additions and 129 deletions

3
NEWS
View File

@ -17,6 +17,9 @@ Post-v2.15.0
cases, e.g if all PMD threads are running on the same NUMA node.
* Userspace datapath now supports up to 2^18 meters.
* Added support for systems with non-contiguous NUMA nodes and core ids.
* Added all-zero IP SNAT handling to conntrack. When a collision is
detected while using ct(commit,nat(src=0.0.0.0)), the source port is
replaced with another non-colliding port in the ephemeral range
(1024, 65535).
- ovs-ctl:
* New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.

View File

@ -148,6 +148,39 @@ enum ct_update_res {
CT_TIMEOUT(ICMP_FIRST) \
CT_TIMEOUT(ICMP_REPLY)
#define NAT_ACTION_SNAT_ALL (NAT_ACTION_SRC | NAT_ACTION_SRC_PORT)
#define NAT_ACTION_DNAT_ALL (NAT_ACTION_DST | NAT_ACTION_DST_PORT)
/* Lowest and highest port (both usable, see IN_RANGE()) of the ephemeral
 * range that NAT falls back to when it has to pick a source port. */
enum ct_ephemeral_range {
MIN_NAT_EPHEMERAL_PORT = 1024,
MAX_NAT_EPHEMERAL_PORT = 65535
};
/* Evaluates to nonzero when 'curr' lies in the inclusive range
 * ['min', 'max'].  Every argument is parenthesized so that arbitrary
 * expressions (e.g. a conditional expression) can be passed safely.
 * Arguments may be evaluated more than once. */
#define IN_RANGE(curr, min, max) \
    ((curr) >= (min) && (curr) <= (max))

/* Moves the lvalue 'curr' to the next port to test: 'min' if 'curr' is
 * out of range or already equal to 'max', 'curr' + 1 otherwise. */
#define NEXT_PORT_IN_RANGE(curr, min, max)                          \
    ((curr) = (!IN_RANGE(curr, min, max) || (curr) == (max))        \
              ? (min) : (curr) + 1)

/* If the current port is out of range increase the attempts by
 * one so that in the worst case scenario the current out of
 * range port plus all the in-range ports get tested.
 * Note that curr can be an out of range port only in case of
 * source port (SNAT with port range unspecified or DNAT),
 * furthermore the source port in the packet has to be less than
 * MIN_NAT_EPHEMERAL_PORT. */
#define N_PORT_ATTEMPTS(curr, min, max)                             \
    ((!IN_RANGE(curr, min, max)) ? ((max) - (min)) + 2              \
                                 : ((max) - (min)) + 1)

/* Loose in-range check, the first curr port can be any port out of
 * the range.
 *
 * The attempt counter 'INAME' is 32 bits wide on purpose: for a range
 * spanning all 65536 ports N_PORT_ATTEMPTS() yields 65536 (or 65537),
 * which would be truncated to 0 (or 1) by a 16-bit counter and cut the
 * iteration short. */
#define FOR_EACH_PORT_IN_RANGE__(curr, min, max, INAME)             \
    for (uint32_t INAME = N_PORT_ATTEMPTS(curr, min, max);          \
         INAME > 0; INAME--, NEXT_PORT_IN_RANGE(curr, min, max))

/* Iterates 'curr' over the ports to test, generating a unique name for
 * the hidden attempt counter so that the loops can be nested. */
#define FOR_EACH_PORT_IN_RANGE(curr, min, max) \
    FOR_EACH_PORT_IN_RANGE__(curr, min, max, OVS_JOIN(idx, __COUNTER__))
enum ct_timeout {
#define CT_TIMEOUT(NAME) CT_TM_##NAME,
CT_TIMEOUTS

View File

@ -110,8 +110,8 @@ static void set_label(struct dp_packet *, struct conn *,
static void *clean_thread_main(void *f_);
static bool
nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
struct conn *nat_conn);
nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
struct conn *nat_conn);
static uint8_t
reverse_icmp_type(uint8_t type);
@ -730,11 +730,11 @@ pat_packet(struct dp_packet *pkt, const struct conn *conn)
}
} else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
if (conn->key.nw_proto == IPPROTO_TCP) {
struct tcp_header *th = dp_packet_l4(pkt);
packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
packet_set_tcp_port(pkt, conn->rev_key.dst.port,
conn->rev_key.src.port);
} else if (conn->key.nw_proto == IPPROTO_UDP) {
struct udp_header *uh = dp_packet_l4(pkt);
packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
packet_set_udp_port(pkt, conn->rev_key.dst.port,
conn->rev_key.src.port);
}
}
}
@ -788,11 +788,9 @@ un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
}
} else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
if (conn->key.nw_proto == IPPROTO_TCP) {
struct tcp_header *th = dp_packet_l4(pkt);
packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
packet_set_tcp_port(pkt, conn->key.dst.port, conn->key.src.port);
} else if (conn->key.nw_proto == IPPROTO_UDP) {
struct udp_header *uh = dp_packet_l4(pkt);
packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
packet_set_udp_port(pkt, conn->key.dst.port, conn->key.src.port);
}
}
}
@ -812,12 +810,10 @@ reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
}
} else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
if (conn->key.nw_proto == IPPROTO_TCP) {
struct tcp_header *th_in = dp_packet_l4(pkt);
packet_set_tcp_port(pkt, th_in->tcp_src,
packet_set_tcp_port(pkt, conn->key.src.port,
conn->key.dst.port);
} else if (conn->key.nw_proto == IPPROTO_UDP) {
struct udp_header *uh_in = dp_packet_l4(pkt);
packet_set_udp_port(pkt, uh_in->udp_src,
packet_set_udp_port(pkt, conn->key.src.port,
conn->key.dst.port);
}
}
@ -1031,14 +1027,14 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
}
} else {
memcpy(nat_conn, nc, sizeof *nat_conn);
bool nat_res = nat_select_range_tuple(ct, nc, nat_conn);
bool nat_res = nat_get_unique_tuple(ct, nc, nat_conn);
if (!nat_res) {
goto nat_res_exhaustion;
}
/* Update nc with nat adjustments made to nat_conn by
* nat_select_range_tuple(). */
* nat_get_unique_tuple(). */
memcpy(nc, nat_conn, sizeof *nc);
}
@ -2252,130 +2248,218 @@ nat_range_hash(const struct conn *conn, uint32_t basis)
return hash_finish(hash, 0);
}
static bool
nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
struct conn *nat_conn)
/* Ports are stored in host byte order for convenience. */
static void
set_sport_range(struct nat_action_info_t *ni, const struct conn_key *k,
uint32_t hash, uint16_t *curr, uint16_t *min,
uint16_t *max)
{
enum { MIN_NAT_EPHEMERAL_PORT = 1024,
MAX_NAT_EPHEMERAL_PORT = 65535 };
uint16_t min_port;
uint16_t max_port;
uint16_t first_port;
uint32_t hash = nat_range_hash(conn, ct->hash_basis);
if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
(!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
min_port = ntohs(conn->key.src.port);
max_port = ntohs(conn->key.src.port);
first_port = min_port;
} else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
(!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
min_port = ntohs(conn->key.dst.port);
max_port = ntohs(conn->key.dst.port);
first_port = min_port;
if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) ||
((ni->nat_action & NAT_ACTION_DST))) {
*curr = ntohs(k->src.port);
*min = MIN_NAT_EPHEMERAL_PORT;
*max = MAX_NAT_EPHEMERAL_PORT;
} else {
uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
uint32_t port_index = hash % (deltap + 1);
first_port = conn->nat_info->min_port + port_index;
min_port = conn->nat_info->min_port;
max_port = conn->nat_info->max_port;
*min = ni->min_port;
*max = ni->max_port;
*curr = *min + (hash % ((*max - *min) + 1));
}
}
uint32_t deltaa = 0;
uint32_t address_index;
union ct_addr ct_addr;
memset(&ct_addr, 0, sizeof ct_addr);
union ct_addr max_ct_addr;
memset(&max_ct_addr, 0, sizeof max_ct_addr);
max_ct_addr = conn->nat_info->max_addr;
if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
deltaa = ntohl(conn->nat_info->max_addr.ipv4) -
ntohl(conn->nat_info->min_addr.ipv4);
address_index = hash % (deltaa + 1);
ct_addr.ipv4 = htonl(
ntohl(conn->nat_info->min_addr.ipv4) + address_index);
static void
set_dport_range(struct nat_action_info_t *ni, const struct conn_key *k,
uint32_t hash, uint16_t *curr, uint16_t *min,
uint16_t *max)
{
if (ni->nat_action & NAT_ACTION_DST_PORT) {
*min = ni->min_port;
*max = ni->max_port;
*curr = *min + (hash % ((*max - *min) + 1));
} else {
deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6,
&conn->nat_info->max_addr.ipv6);
/* deltaa must be within 32 bits for full hash coverage. A 64 or
*curr = ntohs(k->dst.port);
*min = *max = *curr;
}
}
/* Gets the initial in range address based on the hash.
* Addresses are kept in network order. */
static void
get_addr_in_range(union ct_addr *min, union ct_addr *max,
union ct_addr *curr, uint32_t hash, bool ipv4)
{
uint32_t offt, range;
if (ipv4) {
range = (ntohl(max->ipv4) - ntohl(min->ipv4)) + 1;
offt = hash % range;
curr->ipv4 = htonl(ntohl(min->ipv4) + offt);
} else {
range = nat_ipv6_addrs_delta(&min->ipv6, &max->ipv6) + 1;
/* Range must be within 32 bits for full hash coverage. A 64 or
* 128 bit hash is unnecessary and hence not used here. Most code
* is kept common with V4; nat_ipv6_addrs_delta() will do the
* enforcement via max_ct_addr. */
max_ct_addr = conn->nat_info->min_addr;
nat_ipv6_addr_increment(&max_ct_addr.ipv6, deltaa);
address_index = hash % (deltaa + 1);
ct_addr.ipv6 = conn->nat_info->min_addr.ipv6;
nat_ipv6_addr_increment(&ct_addr.ipv6, address_index);
offt = hash % range;
curr->ipv6 = min->ipv6;
nat_ipv6_addr_increment(&curr->ipv6, offt);
}
}
uint16_t port = first_port;
bool all_ports_tried = false;
/* For DNAT or for specified port ranges, we don't use ephemeral ports. */
bool ephemeral_ports_tried
= conn->nat_info->nat_action & NAT_ACTION_DST ||
conn->nat_info->nat_action & NAT_ACTION_SRC_PORT
? true : false;
union ct_addr first_addr = ct_addr;
bool pat_enabled = conn->key.nw_proto == IPPROTO_TCP ||
conn->key.nw_proto == IPPROTO_UDP;
static void
get_initial_addr(const struct conn *conn, union ct_addr *min,
union ct_addr *max, union ct_addr *curr,
uint32_t hash, bool ipv4)
{
const union ct_addr zero_ip = {0};
while (true) {
/* All-zero case. */
if (!memcmp(min, &zero_ip, sizeof *min)) {
if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
nat_conn->rev_key.dst.addr = ct_addr;
if (pat_enabled) {
nat_conn->rev_key.dst.port = htons(port);
}
} else {
nat_conn->rev_key.src.addr = ct_addr;
if (pat_enabled) {
nat_conn->rev_key.src.port = htons(port);
}
*curr = conn->key.src.addr;
} else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
*curr = conn->key.dst.addr;
}
} else {
get_addr_in_range(min, max, curr, hash, ipv4);
}
}
/* Copies '*addr' into 'key'.  When 'action' has NAT_ACTION_SRC set the
 * destination address of 'key' is overwritten, otherwise its source
 * address is. */
static void
store_addr_to_key(union ct_addr *addr, struct conn_key *key,
                  uint16_t action)
{
    union ct_addr *target = (action & NAT_ACTION_SRC) ? &key->dst.addr
                                                      : &key->src.addr;

    *target = *addr;
}
static void
next_addr_in_range(union ct_addr *curr, union ct_addr *min,
union ct_addr *max, bool ipv4)
{
if (ipv4) {
/* This check could be unified with IPv6, but let's avoid
* an unneeded memcmp() in case of IPv4. */
if (min->ipv4 == max->ipv4) {
return;
}
bool found = conn_lookup(ct, &nat_conn->rev_key, time_msec(), NULL,
NULL);
if (!found) {
curr->ipv4 = (curr->ipv4 == max->ipv4) ? min->ipv4
: htonl(ntohl(curr->ipv4) + 1);
} else {
if (!memcmp(min, max, sizeof *min)) {
return;
}
if (!memcmp(curr, max, sizeof *curr)) {
*curr = *min;
return;
}
nat_ipv6_addr_increment(&curr->ipv6, 1);
}
}
/* Advances '*curr' through next_addr_in_range() and then compares it
 * with '*guard'.
 *
 * Returns true when the advanced address equals the guard, false
 * otherwise.  IPv4 compares only the 'ipv4' member, other families
 * compare the whole union with memcmp(). */
static bool
next_addr_in_range_guarded(union ct_addr *curr, union ct_addr *min,
                           union ct_addr *max, union ct_addr *guard,
                           bool ipv4)
{
    next_addr_in_range(curr, min, max, ipv4);

    return ipv4 ? curr->ipv4 == guard->ipv4
                : !memcmp(curr, guard, sizeof *curr);
}
/* This function tries to get a unique tuple.
* Every iteration checks that the reverse tuple doesn't
* collide with any existing one.
*
* In case of SNAT:
* - For each src IP address in the range (if any).
* - Try to find a source port in range (if any).
* - If no port range exists, use the whole
* ephemeral range (after testing the port
* used by the sender), otherwise use the
* specified range.
*
* In case of DNAT:
* - For each dst IP address in the range (if any).
* - For each dport in range (if any).
* - Try to find a source port in the ephemeral range
* (after testing the port used by the sender).
*
* If none can be found, return exhaustion to the caller. */
static bool
nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
struct conn *nat_conn)
{
union ct_addr min_addr = {0}, max_addr = {0}, curr_addr = {0},
guard_addr = {0};
uint32_t hash = nat_range_hash(conn, ct->hash_basis);
bool pat_proto = conn->key.nw_proto == IPPROTO_TCP ||
conn->key.nw_proto == IPPROTO_UDP;
uint16_t min_dport, max_dport, curr_dport;
uint16_t min_sport, max_sport, curr_sport;
min_addr = conn->nat_info->min_addr;
max_addr = conn->nat_info->max_addr;
get_initial_addr(conn, &min_addr, &max_addr, &curr_addr, hash,
(conn->key.dl_type == htons(ETH_TYPE_IP)));
/* Save the address we started from so that
* we can stop once we reach it. */
guard_addr = curr_addr;
set_sport_range(conn->nat_info, &conn->key, hash, &curr_sport,
&min_sport, &max_sport);
set_dport_range(conn->nat_info, &conn->key, hash, &curr_dport,
&min_dport, &max_dport);
another_round:
store_addr_to_key(&curr_addr, &nat_conn->rev_key,
conn->nat_info->nat_action);
if (!pat_proto) {
if (!conn_lookup(ct, &nat_conn->rev_key,
time_msec(), NULL, NULL)) {
return true;
} else if (pat_enabled && !all_ports_tried) {
if (min_port == max_port) {
all_ports_tried = true;
} else if (port == max_port) {
port = min_port;
} else {
port++;
}
goto next_addr;
}
FOR_EACH_PORT_IN_RANGE(curr_dport, min_dport, max_dport) {
nat_conn->rev_key.src.port = htons(curr_dport);
FOR_EACH_PORT_IN_RANGE(curr_sport, min_sport, max_sport) {
nat_conn->rev_key.dst.port = htons(curr_sport);
if (!conn_lookup(ct, &nat_conn->rev_key,
time_msec(), NULL, NULL)) {
return true;
}
if (port == first_port) {
all_ports_tried = true;
}
} else {
if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
ct_addr.ipv4 = htonl(ntohl(ct_addr.ipv4) + 1);
} else {
nat_ipv6_addr_increment(&ct_addr.ipv6, 1);
}
} else {
ct_addr = conn->nat_info->min_addr;
}
if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
if (pat_enabled && !ephemeral_ports_tried) {
ephemeral_ports_tried = true;
ct_addr = conn->nat_info->min_addr;
first_addr = ct_addr;
min_port = MIN_NAT_EPHEMERAL_PORT;
max_port = MAX_NAT_EPHEMERAL_PORT;
} else {
break;
}
}
first_port = min_port;
port = first_port;
all_ports_tried = false;
}
}
return false;
/* Check if next IP is in range and respin. Otherwise, notify
* exhaustion to the caller. */
next_addr:
if (next_addr_in_range_guarded(&curr_addr, &min_addr,
&max_addr, &guard_addr,
conn->key.dl_type == htons(ETH_TYPE_IP))) {
return false;
}
goto another_round;
}
static enum ct_update_res

View File

@ -2138,8 +2138,7 @@ for <var>i</var> in [1,<var>n_members</var>]:
<code>nat(src=0.0.0.0)</code>. In this case, when a source port
collision is detected during the commit, the source port will be
translated to an ephemeral port. If there is no collision, no SNAT
is performed. Note that this is currently only implemented in the
Linux kernel datapath.
is performed.
</p>
<p>

View File

@ -99,12 +99,10 @@ m4_define([CHECK_CONNTRACK_NAT])
# CHECK_CONNTRACK_ZEROIP_SNAT()
#
# Perform requirements checks for running conntrack all-zero IP SNAT tests.
# The userspace datapath does not support all-zero IP SNAT.
# The userspace datapath always supports all-zero IP SNAT, so no check is
# needed.
#
m4_define([CHECK_CONNTRACK_ZEROIP_SNAT],
[
AT_SKIP_IF([:])
])
m4_define([CHECK_CONNTRACK_ZEROIP_SNAT])
# CHECK_CONNTRACK_TIMEOUT()
#