2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 09:58:01 +00:00
ovs/lib/conntrack-tcp.c
Ben Pfaff 3eec7fb075 pcap-file: Fix calculation of TCP payload length in tcp_reader_run().
The calculation in tcp_reader_run() failed to account for L2 padding.
This fixes the problem, by moving the existing function
tcp_payload_length() from a conntrack private header file into
dp-packet.h and renaming it to suit the dp_packet style.

Signed-off-by: Ben Pfaff <blp@ovn.org>
Acked-by: Ilya Maximets <i.maximets@ovn.org>
2021-02-02 09:59:31 -08:00

521 lines
17 KiB
C

/*-
* Copyright (c) 2001 Daniel Hartmeier
* Copyright (c) 2002 - 2008 Henning Brauer
* Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
* Copyright (c) 2015, 2016 Nicira, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Effort sponsored in part by the Defense Advanced Research Projects
* Agency (DARPA) and Air Force Research Laboratory, Air Force
* Materiel Command, USAF, under agreement number F30602-01-2-0537.
*
* $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
*/
#include <config.h>
#include "conntrack-private.h"
#include "conntrack-tp.h"
#include "coverage.h"
#include "ct-dpif.h"
#include "dp-packet.h"
#include "util.h"
COVERAGE_DEFINE(conntrack_tcp_seq_chk_bypass);
COVERAGE_DEFINE(conntrack_tcp_seq_chk_failed);
COVERAGE_DEFINE(conntrack_invalid_tcp_flags);
/* Sequence-tracking state kept for one endpoint (direction) of a tracked
 * TCP connection. */
struct tcp_peer {
uint32_t seqlo; /* Max sequence number sent */
uint32_t seqhi; /* Max the other end ACKd + win */
uint16_t max_win; /* largest window (pre scaling) */
/* Window scale shift in the CT_WSCALE_MASK bits, combined with the
 * CT_WSCALE_FLAG / CT_WSCALE_UNKNOWN status bits. */
uint8_t wscale; /* window scaling factor */
/* CT_DPIF_TCPS_* state tracked for this endpoint. */
enum ct_dpif_tcp_state state;
};
/* TCP-specific connection: the generic 'struct conn' plus per-endpoint
 * sequence state.  conn_tcp_cast() recovers this structure from a pointer
 * to its 'up' member. */
struct conn_tcp {
struct conn up;
/* peer[0] is the originating direction, peer[1] the reply direction. */
struct tcp_peer peer[2]; /* 'conn' lock protected. */
};
/* TCP option kinds examined when parsing the options area. */
enum {
    TCPOPT_EOL = 0,     /* End of option list. */
    TCPOPT_NOP = 1,     /* Single-byte padding option. */
    TCPOPT_WINDOW = 3,  /* Window scale option. */
};
/* TCP sequence numbers are 32-bit integers that wrap around, so they must
 * be compared with modular arithmetic.  These wrappers apply the generic
 * INT_MOD_*() helpers to sequence numbers. */
#define SEQ_LT(a, b)  INT_MOD_LT(a, b)
#define SEQ_LEQ(a, b) INT_MOD_LEQ(a, b)
#define SEQ_GT(a, b)  INT_MOD_GT(a, b)
#define SEQ_GEQ(a, b) INT_MOD_GEQ(a, b)
#define SEQ_MIN(a, b) INT_MOD_MIN(a, b)
#define SEQ_MAX(a, b) INT_MOD_MAX(a, b)
/* Returns the 'struct conn_tcp' whose 'up' member is 'conn'. */
static struct conn_tcp *
conn_tcp_cast(const struct conn *conn)
{
    struct conn_tcp *tcp_conn = CONTAINER_OF(conn, struct conn_tcp, up);

    return tcp_conn;
}
/* Returns true if 'flags' is a combination of TCP flags that is rejected
 * as invalid, false otherwise.
 *
 * pf does this in pf_normalize_tcp(), and it is called only if scrub is
 * enabled.  We're not scrubbing, but this check seems reasonable. */
static bool
tcp_invalid_flags(uint16_t flags)
{
    const bool has_ack = (flags & TCP_ACK) != 0;

    if (flags & TCP_SYN) {
        /* A SYN may not be combined with RST or FIN. */
        if (flags & (TCP_RST | TCP_FIN)) {
            return true;
        }
    } else if (!(flags & (TCP_ACK | TCP_RST))) {
        /* Illegal packet: without SYN, ACK or RST must be present. */
        return true;
    }

    /* FIN, PSH and URG are only valid if ACK is set. */
    return !has_ack && (flags & (TCP_FIN | TCP_PSH | TCP_URG)) != 0;
}
/* Largest window scale shift accepted; tcp_get_wscale() clamps to it. */
#define TCP_MAX_WSCALE 14
/* Bit set in tcp_peer.wscale when a window scale option was parsed. */
#define CT_WSCALE_FLAG 0x80
/* Bit set in tcp_peer.wscale when the connection was created from a packet
 * without SYN, so no window scale option was parsed for it. */
#define CT_WSCALE_UNKNOWN 0x40
/* Bits of tcp_peer.wscale that hold the shift count itself. */
#define CT_WSCALE_MASK 0xf
/* Scans the TCP options that follow the fixed header 'tcp' for a window
 * scale option.
 *
 * Returns 0 if none was found.  Otherwise returns the advertised shift,
 * clamped to TCP_MAX_WSCALE, with CT_WSCALE_FLAG set.  Scanning stops at an
 * end-of-list option or when fewer than 3 option bytes remain. */
static uint8_t
tcp_get_wscale(const struct tcp_header *tcp)
{
    const uint8_t *cursor = (const uint8_t *)(tcp + 1);
    int remaining = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
    uint8_t result = 0;

    while (remaining >= 3) {
        const uint8_t kind = cursor[0];
        uint8_t step;

        if (kind == TCPOPT_EOL) {
            break;
        }

        if (kind == TCPOPT_NOP) {
            /* Single-byte option: no length field to consult. */
            cursor++;
            remaining--;
            continue;
        }

        if (kind == TCPOPT_WINDOW) {
            result = MIN(cursor[2], TCP_MAX_WSCALE);
            result |= CT_WSCALE_FLAG;
        }

        /* Every other option carries its total length in its second byte.
         * Treat lengths below 2 as 2 so that the scan always advances. */
        step = cursor[1];
        if (step < 2) {
            step = 2;
        }
        remaining -= step;
        cursor += step;
    }

    return result;
}
/* Returns true if TCP sequence checking is disabled for 'ct', in which case
 * the conntrack_tcp_seq_chk_bypass coverage counter is also incremented.
 * Returns false when sequence checking is enabled. */
static bool
tcp_bypass_seq_chk(struct conntrack *ct)
{
    if (conntrack_get_tcp_seq_chk(ct)) {
        return false;
    }

    COVERAGE_INC(conntrack_tcp_seq_chk_bypass);
    return true;
}
/* Updates the TCP tracking state of 'conn_' for packet 'pkt'.
 *
 * 'reply' selects which endpoint sent 'pkt': peer[1] when true, peer[0]
 * otherwise.  'now' is passed through to conn_update_expiration().
 *
 * Returns:
 *
 *   - CT_UPDATE_INVALID if the TCP flags are rejected by
 *     tcp_invalid_flags(), or if the packet passes neither the strict
 *     sequence-window check nor the looser fallback check.
 *
 *   - CT_UPDATE_NEW if 'pkt' is a SYN without ACK and both endpoints are
 *     already at CT_DPIF_TCPS_FIN_WAIT_2 or later; both states are reset to
 *     CT_DPIF_TCPS_CLOSED.
 *
 *   - CT_UPDATE_VALID_NEW if 'pkt' is a SYN without ACK and the sender's
 *     state is at most CT_DPIF_TCPS_SYN_SENT.
 *
 *   - CT_UPDATE_VALID otherwise.
 *
 * The expiration is refreshed on the CT_UPDATE_VALID_NEW path and on the
 * strict-check path only; the fallback path deliberately leaves it alone.
 *
 * NOTE(review): struct conn_tcp documents 'peer' as protected by the 'conn'
 * lock; presumably the caller holds it -- confirm against the caller. */
static enum ct_update_res
tcp_conn_update(struct conntrack *ct, struct conn *conn_,
struct dp_packet *pkt, bool reply, long long now)
{
struct conn_tcp *conn = conn_tcp_cast(conn_);
struct tcp_header *tcp = dp_packet_l4(pkt);
/* The peer that sent 'pkt' */
struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
/* The peer that should receive 'pkt' */
struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
/* Window shifts applied to the sender's and receiver's windows. */
uint8_t sws = 0, dws = 0;
uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
uint16_t win = ntohs(tcp->tcp_winsz);
uint32_t ack, end, seq, orig_seq;
uint32_t p_len = dp_packet_get_tcp_payload_length(pkt);
if (tcp_invalid_flags(tcp_flags)) {
COVERAGE_INC(conntrack_invalid_tcp_flags);
return CT_UPDATE_INVALID;
}
/* SYN without ACK: either a new connection reusing a finished one, or a
 * (re)sent opening SYN. */
if ((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN) {
if (dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
&& src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
src->state = dst->state = CT_DPIF_TCPS_CLOSED;
return CT_UPDATE_NEW;
} else if (src->state <= CT_DPIF_TCPS_SYN_SENT) {
src->state = CT_DPIF_TCPS_SYN_SENT;
conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIRST_PACKET, now);
return CT_UPDATE_VALID_NEW;
}
}
/* Choose the window shifts.  They stay 0 for any packet with SYN set.
 * Known shifts are used when both peers have CT_WSCALE_FLAG; the maximum
 * shift is assumed when both peers are marked CT_WSCALE_UNKNOWN. */
if (src->wscale & CT_WSCALE_FLAG
&& dst->wscale & CT_WSCALE_FLAG
&& !(tcp_flags & TCP_SYN)) {
sws = src->wscale & CT_WSCALE_MASK;
dws = dst->wscale & CT_WSCALE_MASK;
} else if (src->wscale & CT_WSCALE_UNKNOWN
&& dst->wscale & CT_WSCALE_UNKNOWN
&& !(tcp_flags & TCP_SYN)) {
sws = TCP_MAX_WSCALE;
dws = TCP_MAX_WSCALE;
}
/*
* Sequence tracking algorithm from Guido van Rooij's paper:
* http://www.madison-gurkha.com/publications/tcp_filtering/
* tcp_filtering.ps
*/
orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
bool check_ackskew = true;
if (src->state < CT_DPIF_TCPS_SYN_SENT) {
/* First packet from this end. Set its state */
ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
/* 'end' is the sequence number just past this segment; SYN and FIN each
 * consume one sequence number. */
end = seq + p_len;
if (tcp_flags & TCP_SYN) {
end++;
if (dst->wscale & CT_WSCALE_FLAG) {
src->wscale = tcp_get_wscale(tcp);
if (src->wscale & CT_WSCALE_FLAG) {
/* Remove scale factor from initial window */
sws = src->wscale & CT_WSCALE_MASK;
win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
dws = dst->wscale & CT_WSCALE_MASK;
} else {
/* fixup other window */
dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
/* in case of a retrans SYN|ACK */
dst->wscale = 0;
}
}
}
if (tcp_flags & TCP_FIN) {
end++;
}
src->seqlo = seq;
src->state = CT_DPIF_TCPS_SYN_SENT;
/*
* May need to slide the window (seqhi may have been set by
* the crappy stack check or if we picked up the connection
* after establishment)
*/
if (src->seqhi == 1
|| SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
src->seqhi = end + MAX(1, dst->max_win << dws);
/* We are either picking up a new connection or a connection which
* was already in place. We are more permissive in terms of
* ackskew checking in these cases.
*/
check_ackskew = false;
}
if (win > src->max_win) {
src->max_win = win;
}
} else {
ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
end = seq + p_len;
if (tcp_flags & TCP_SYN) {
end++;
}
if (tcp_flags & TCP_FIN) {
end++;
}
}
if ((tcp_flags & TCP_ACK) == 0) {
/* Let it pass through the ack skew check */
ack = dst->seqlo;
} else if ((ack == 0
&& (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
/* broken tcp stacks do not set ack */) {
/* Many stacks (ours included) will set the ACK number in an
* FIN|ACK if the SYN times out -- no sequence to ACK. */
ack = dst->seqlo;
}
if (seq == end) {
/* Ease sequencing restrictions on no data packets */
seq = src->seqlo;
end = seq;
}
/* How far the acknowledgment lags behind what the receiver has sent;
 * forced to 0 when the ackskew check was relaxed above. */
int ackskew = check_ackskew ? dst->seqlo - ack : 0;
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
/* Strict path: every window check passes, or sequence checking is
 * disabled for 'ct'.  In the condition below, each comment describes the
 * clause on the line before it. */
if ((SEQ_GEQ(src->seqhi, end)
/* Last octet inside other's window space */
&& SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
/* Retrans: not more than one window back */
&& (ackskew >= -MAXACKWINDOW)
/* Acking not more than one reassembled fragment backwards */
&& (ackskew <= (MAXACKWINDOW << sws))
/* Acking not more than one window forward */
&& ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
|| (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo)))
|| tcp_bypass_seq_chk(ct)) {
/* Require an exact/+1 sequence match on resets when possible */
/* update max window */
if (src->max_win < win) {
src->max_win = win;
}
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo)) {
src->seqlo = end;
}
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
dst->seqhi = ack + MAX((win << sws), 1);
}
/* update states */
if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
src->state = CT_DPIF_TCPS_SYN_SENT;
}
if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
src->state = CT_DPIF_TCPS_CLOSING;
}
if (tcp_flags & TCP_ACK) {
if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
dst->state = CT_DPIF_TCPS_ESTABLISHED;
} else if (dst->state == CT_DPIF_TCPS_CLOSING) {
dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
}
}
if (tcp_flags & TCP_RST) {
src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
}
/* Pick the timeout class from the combined state of both endpoints;
 * the first matching condition wins. */
if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
&& dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSED, now);
} else if (src->state >= CT_DPIF_TCPS_CLOSING
&& dst->state >= CT_DPIF_TCPS_CLOSING) {
conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIN_WAIT, now);
} else if (src->state < CT_DPIF_TCPS_ESTABLISHED
|| dst->state < CT_DPIF_TCPS_ESTABLISHED) {
conn_update_expiration(ct, &conn->up, CT_TM_TCP_OPENING, now);
} else if (src->state >= CT_DPIF_TCPS_CLOSING
|| dst->state >= CT_DPIF_TCPS_CLOSING) {
conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSING, now);
} else {
conn_update_expiration(ct, &conn->up, CT_TM_TCP_ESTABLISHED, now);
}
} else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
|| dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
|| src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
&& SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
/* Within a window forward of the originating packet */
&& SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
/* Within a window backward of the originating packet */
/*
* This currently handles three situations:
* 1) Stupid stacks will shotgun SYNs before their peer
* replies.
* 2) When PF catches an already established stream (the
* firewall rebooted, the state table was flushed, routes
* changed...)
* 3) Packets get funky immediately after the connection
* closes (this should catch Solaris spurious ACK|FINs
* that web servers like to spew after a close)
*
* This must be a little more careful than the above code
* since packet floods will also be caught here. We don't
* update the TTL here to mitigate the damage of a packet
* flood and so the same code can handle awkward establishment
* and a loosened connection close.
* In the establishment case, a correct peer response will
* validate the connection, go through the normal state code
* and keep updating the state TTL.
*/
/* update max window */
if (src->max_win < win) {
src->max_win = win;
}
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo)) {
src->seqlo = end;
}
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
dst->seqhi = ack + MAX((win << sws), 1);
}
/*
* Cannot set dst->seqhi here since this could be a shotgunned
* SYN and not an already established connection.
*/
if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
src->state = CT_DPIF_TCPS_CLOSING;
}
if (tcp_flags & TCP_RST) {
src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
}
} else {
/* Neither the strict nor the loose check accepted the packet. */
COVERAGE_INC(conntrack_tcp_seq_chk_failed);
return CT_UPDATE_INVALID;
}
return CT_UPDATE_VALID;
}
static bool
tcp_valid_new(struct dp_packet *pkt)
{
struct tcp_header *tcp = dp_packet_l4(pkt);
uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
if (tcp_invalid_flags(tcp_flags)) {
return false;
}
/* A syn+ack is not allowed to create a connection. We want to allow
* totally new connections (syn) or already established, not partially
* open (syn+ack). */
if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
return false;
}
return true;
}
/* Allocates and initializes TCP tracking state for a connection whose first
 * packet is 'pkt', and returns a pointer to its embedded 'struct conn'.
 *
 * The sender of 'pkt' becomes peer[0] and starts in CT_DPIF_TCPS_SYN_SENT;
 * peer[1] starts in CT_DPIF_TCPS_CLOSED.  'tp_id' is stored in the new
 * connection and 'now' is used to initialize its expiration with the
 * CT_TM_TCP_FIRST_PACKET timeout. */
static struct conn *
tcp_new_conn(struct conntrack *ct, struct dp_packet *pkt, long long now,
             uint32_t tp_id)
{
    struct tcp_header *tcp = dp_packet_l4(pkt);
    const uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
    struct conn_tcp *newconn = xzalloc(sizeof *newconn);
    struct tcp_peer *src = &newconn->peer[0];
    struct tcp_peer *dst = &newconn->peer[1];

    /* Sequence space of the sender.  SYN and FIN each take one sequence
     * number in addition to the payload. */
    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
    src->seqhi = src->seqlo + dp_packet_get_tcp_payload_length(pkt) + 1;
    if (tcp_flags & TCP_SYN) {
        src->seqhi++;
    }
    if (tcp_flags & TCP_FIN) {
        src->seqhi++;
    }

    /* Window scaling is read from the options of a SYN.  For any other
     * first packet it is marked unknown for both peers. */
    if (tcp_flags & TCP_SYN) {
        src->wscale = tcp_get_wscale(tcp);
    } else {
        src->wscale = CT_WSCALE_UNKNOWN;
        dst->wscale = CT_WSCALE_UNKNOWN;
    }

    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
    uint8_t shift = src->wscale & CT_WSCALE_MASK;
    if (shift) {
        /* Remove scale factor from initial window */
        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << shift);
    }

    /* Nothing has been seen from the other end yet. */
    dst->seqhi = 1;
    dst->max_win = 1;

    src->state = CT_DPIF_TCPS_SYN_SENT;
    dst->state = CT_DPIF_TCPS_CLOSED;

    newconn->up.tp_id = tp_id;
    conn_init_expiration(ct, &newconn->up, CT_TM_TCP_FIRST_PACKET, now);

    return &newconn->up;
}
/* Translates the status bits of 'peer->wscale' into CT_DPIF_TCPF_* flags:
 * CT_WSCALE_FLAG maps to CT_DPIF_TCPF_WINDOW_SCALE and CT_WSCALE_UNKNOWN
 * maps to CT_DPIF_TCPF_BE_LIBERAL. */
static uint8_t
tcp_peer_to_protoinfo_flags(const struct tcp_peer *peer)
{
    const uint8_t wscale = peer->wscale;
    uint8_t flags = 0;

    flags |= (wscale & CT_WSCALE_FLAG) ? CT_DPIF_TCPF_WINDOW_SCALE : 0;
    flags |= (wscale & CT_WSCALE_UNKNOWN) ? CT_DPIF_TCPF_BE_LIBERAL : 0;

    return flags;
}
static void
tcp_conn_get_protoinfo(const struct conn *conn_,
struct ct_dpif_protoinfo *protoinfo)
{
const struct conn_tcp *conn = conn_tcp_cast(conn_);
protoinfo->proto = IPPROTO_TCP;
protoinfo->tcp.state_orig = conn->peer[0].state;
protoinfo->tcp.state_reply = conn->peer[1].state;
protoinfo->tcp.wscale_orig = conn->peer[0].wscale & CT_WSCALE_MASK;
protoinfo->tcp.wscale_reply = conn->peer[1].wscale & CT_WSCALE_MASK;
protoinfo->tcp.flags_orig = tcp_peer_to_protoinfo_flags(&conn->peer[0]);
protoinfo->tcp.flags_reply = tcp_peer_to_protoinfo_flags(&conn->peer[1]);
}
/* TCP implementation of the conntrack L4 protocol callbacks. */
struct ct_l4_proto ct_proto_tcp = {
    .valid_new = tcp_valid_new,
    .new_conn = tcp_new_conn,
    .conn_update = tcp_conn_update,
    .conn_get_protoinfo = tcp_conn_get_protoinfo,
};