2009-07-08 13:19:16 -07:00
|
|
|
/*
|
2014-07-10 13:21:35 -07:00
|
|
|
* Copyright (c) 2008, 2010, 2011, 2014 Nicira, Inc.
|
2009-07-08 13:19:16 -07:00
|
|
|
*
|
2009-06-15 15:11:30 -07:00
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at:
|
2009-07-08 13:19:16 -07:00
|
|
|
*
|
2009-06-15 15:11:30 -07:00
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
2009-07-08 13:19:16 -07:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef NETLINK_PROTOCOL_H
|
|
|
|
#define NETLINK_PROTOCOL_H 1
|
|
|
|
|
|
|
|
/* Netlink protocol definitions.
|
|
|
|
*
|
2010-12-07 09:33:27 -08:00
|
|
|
* Netlink is a message framing format described in RFC 3549 and used heavily
|
|
|
|
* in Linux to access the network stack. Open vSwitch uses AF_NETLINK sockets
|
|
|
|
* for this purpose on Linux. But on all platforms, Open vSwitch uses Netlink
|
|
|
|
* message framing internally for certain purposes.
|
|
|
|
*
|
|
|
|
* This header provides access to the Netlink message framing definitions
|
|
|
|
* regardless of platform. On Linux, it includes the proper headers directly;
|
|
|
|
* on other platforms it directly defines the structures and macros itself.
|
|
|
|
*/
|
2009-07-08 13:19:16 -07:00
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include "util.h"
|
|
|
|
|
2010-12-07 09:33:27 -08:00
|
|
|
#ifdef HAVE_NETLINK
|
|
|
|
#include <linux/netlink.h>
|
|
|
|
#include <linux/genetlink.h>
|
2011-09-01 18:48:29 -07:00
|
|
|
|
2010-12-07 09:33:27 -08:00
|
|
|
#else
|
2016-07-11 14:59:50 -07:00
|
|
|
#define NETLINK_NETFILTER 12
|
2009-07-08 13:19:16 -07:00
|
|
|
#define NETLINK_GENERIC 16
|
|
|
|
|
|
|
|
/* nlmsg_flags bits. */
|
|
|
|
#define NLM_F_REQUEST 0x001
|
|
|
|
#define NLM_F_MULTI 0x002
|
|
|
|
#define NLM_F_ACK 0x004
|
|
|
|
#define NLM_F_ECHO 0x008
|
|
|
|
|
ct-dpif, dpif-netlink: Add conntrack timeout policy support
This patch first defines the dpif interface for a datapath to support
adding, deleting, getting and dumping conntrack timeout policy.
The timeout policy is identified by a 4 bytes unsigned integer in
datapath, and it currently supports timeouts for TCP, UDP, and ICMP
protocols.
Moreover, this patch provides the implementation for Linux kernel
datapath in dpif-netlink.
In Linux kernel, the timeout policy is maintained per L3/L4 protocol,
and it is identified by 32 bytes null terminated string. On the other
hand, in vswitchd, the timeout policy is a generic one that consists of
all the supported L4 protocols. Therefore, one of the main tasks in
dpif-netlink is to break down the generic timeout policy into 6
sub policies (ipv4 tcp, udp, icmp, and ipv6 tcp, udp, icmp),
and push down the configuration using the netlink API in
netlink-conntrack.c.
This patch also adds missing symbols in the windows datapath so
that the build on windows can pass.
Appveyor CI:
* https://ci.appveyor.com/project/YiHungWei/ovs/builds/26387754
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Alin Gabriel Serdean <aserdean@ovn.org>
Signed-off-by: Justin Pettit <jpettit@ovn.org>
2019-08-28 15:14:24 -07:00
|
|
|
/* GET request flags. */
|
2009-07-08 13:19:16 -07:00
|
|
|
#define NLM_F_ROOT 0x100
|
|
|
|
#define NLM_F_MATCH 0x200
|
|
|
|
#define NLM_F_ATOMIC 0x400
|
|
|
|
#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH)
|
|
|
|
|
ct-dpif, dpif-netlink: Add conntrack timeout policy support
This patch first defines the dpif interface for a datapath to support
adding, deleting, getting and dumping conntrack timeout policy.
The timeout policy is identified by a 4 bytes unsigned integer in
datapath, and it currently supports timeouts for TCP, UDP, and ICMP
protocols.
Moreover, this patch provides the implementation for Linux kernel
datapath in dpif-netlink.
In Linux kernel, the timeout policy is maintained per L3/L4 protocol,
and it is identified by 32 bytes null terminated string. On the other
hand, in vswitchd, the timeout policy is a generic one that consists of
all the supported L4 protocols. Therefore, one of the main tasks in
dpif-netlink is to break down the generic timeout policy into 6
sub policies (ipv4 tcp, udp, icmp, and ipv6 tcp, udp, icmp),
and push down the configuration using the netlink API in
netlink-conntrack.c.
This patch also adds missing symbols in the windows datapath so
that the build on windows can pass.
Appveyor CI:
* https://ci.appveyor.com/project/YiHungWei/ovs/builds/26387754
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Alin Gabriel Serdean <aserdean@ovn.org>
Signed-off-by: Justin Pettit <jpettit@ovn.org>
2019-08-28 15:14:24 -07:00
|
|
|
/* NEW request flags. */
|
|
|
|
#define NLM_F_REPLACE 0x100
|
|
|
|
#define NLM_F_EXCL 0x200
|
|
|
|
#define NLM_F_CREATE 0x400
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
/* nlmsg_type values. */
|
|
|
|
#define NLMSG_NOOP 1
|
|
|
|
#define NLMSG_ERROR 2
|
|
|
|
#define NLMSG_DONE 3
|
|
|
|
#define NLMSG_OVERRUN 4
|
|
|
|
|
|
|
|
#define NLMSG_MIN_TYPE 0x10
|
|
|
|
|
2014-07-29 15:22:03 +00:00
|
|
|
#define MAX_LINKS 32
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
/* Netlink message header.
 *
 * This definition is only compiled when HAVE_NETLINK is not defined; otherwise
 * <linux/netlink.h> supplies it.  All fields are fixed-width, and the build
 * assertion that follows the struct pins its size at 16 bytes. */
struct nlmsghdr {
|
|
|
|
uint32_t nlmsg_len;     /* Message length; presumably includes this header
                         * (see RFC 3549) -- TODO confirm. */
|
|
|
|
uint16_t nlmsg_type;    /* Message type: see the "nlmsg_type values" above. */
|
|
|
|
uint16_t nlmsg_flags;   /* Bitwise OR of the NLM_F_* "nlmsg_flags bits". */
|
|
|
|
uint32_t nlmsg_seq;     /* Sequence number (semantics per RFC 3549). */
|
|
|
|
uint32_t nlmsg_pid;     /* Presumably the sender's port ID -- TODO confirm. */
|
|
|
|
};
|
|
|
|
BUILD_ASSERT_DECL(sizeof(struct nlmsghdr) == 16);
|
|
|
|
|
|
|
|
#define NLMSG_ALIGNTO 4
|
|
|
|
#define NLMSG_ALIGN(SIZE) ROUND_UP(SIZE, NLMSG_ALIGNTO)
|
|
|
|
#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr)))
|
|
|
|
|
|
|
|
/* Error report that embeds a Netlink message header.
 *
 * NOTE(review): presumably this is the payload of an NLMSG_ERROR message --
 * confirm against the Linux netlink documentation.
 *
 * The build assertion that follows the struct requires a total size of 20
 * bytes, i.e. a 4-byte 'error' followed by the 16-byte 'struct nlmsghdr'. */
struct nlmsgerr
|
|
|
|
{
|
|
|
|
int error;              /* Presumably 0 for an ACK, otherwise a negative errno
                         * value -- TODO confirm. */
|
|
|
|
struct nlmsghdr msg;    /* Presumably the header of the message that this
                         * report refers to -- TODO confirm. */
|
|
|
|
};
|
|
|
|
BUILD_ASSERT_DECL(sizeof(struct nlmsgerr) == 20);
|
|
|
|
|
|
|
|
/* Generic Netlink message header.
 *
 * Only compiled when HAVE_NETLINK is not defined; otherwise
 * <linux/genetlink.h> supplies it.  The build assertion that follows the
 * struct pins its size at 4 bytes, and GENL_HDRLEN is that size rounded up
 * with NLMSG_ALIGN(). */
struct genlmsghdr {
|
|
|
|
uint8_t cmd;            /* Command code; presumably family-specific, e.g. one
                         * of CTRL_CMD_* -- TODO confirm. */
|
|
|
|
uint8_t version;        /* Version number (presumably of the family's
                         * protocol -- TODO confirm). */
|
|
|
|
uint16_t reserved;      /* Reserved field; brings the header to 4 bytes. */
|
|
|
|
};
|
|
|
|
BUILD_ASSERT_DECL(sizeof(struct genlmsghdr) == 4);
|
|
|
|
|
|
|
|
#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr))
|
|
|
|
|
|
|
|
/* Netlink attribute header.
 *
 * Only compiled when HAVE_NETLINK is not defined.  The build assertion that
 * follows the struct pins its size at 4 bytes, and NLA_HDRLEN is that size
 * rounded up with NLA_ALIGN(). */
struct nlattr {
|
|
|
|
uint16_t nla_len;       /* Attribute length; presumably includes this header
                         * but not trailing padding -- TODO confirm. */
|
|
|
|
uint16_t nla_type;      /* Attribute type.  NOTE(review): presumably the top
                         * two bits may carry NLA_F_NESTED and
                         * NLA_F_NET_BYTEORDER, to be stripped with
                         * NLA_TYPE_MASK -- confirm. */
|
|
|
|
};
|
|
|
|
BUILD_ASSERT_DECL(sizeof(struct nlattr) == 4);
|
|
|
|
|
|
|
|
#define NLA_ALIGNTO 4
|
|
|
|
#define NLA_ALIGN(SIZE) ROUND_UP(SIZE, NLA_ALIGNTO)
|
|
|
|
#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr)))
|
|
|
|
|
|
|
|
#define GENL_MIN_ID NLMSG_MIN_TYPE
|
|
|
|
#define GENL_MAX_ID 1023
|
|
|
|
|
|
|
|
#define GENL_ID_CTRL NLMSG_MIN_TYPE
|
|
|
|
|
|
|
|
/* CTRL_CMD_* command codes.
 *
 * NOTE(review): presumably these are the commands of the Generic Netlink
 * controller family (GENL_ID_CTRL) -- confirm.
 *
 * The values are implicit and count up from 0.  __CTRL_CMD_MAX is a sentinel
 * one past the last real command; the CTRL_CMD_MAX macro that follows the enum
 * is derived from it. */
enum {
|
|
|
|
CTRL_CMD_UNSPEC,        /* 0 */
|
|
|
|
CTRL_CMD_NEWFAMILY,     /* 1 */
|
|
|
|
CTRL_CMD_DELFAMILY,     /* 2 */
|
|
|
|
CTRL_CMD_GETFAMILY,     /* 3 */
|
|
|
|
CTRL_CMD_NEWOPS,        /* 4 */
|
|
|
|
CTRL_CMD_DELOPS,        /* 5 */
|
|
|
|
CTRL_CMD_GETOPS,        /* 6 */
|
|
|
|
__CTRL_CMD_MAX,         /* 7: sentinel, not a command. */
|
|
|
|
};
|
|
|
|
|
|
|
|
#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1)
|
|
|
|
|
|
|
|
/* CTRL_ATTR_* attribute types.
 *
 * NOTE(review): presumably these are the attributes carried in Generic
 * Netlink controller (GENL_ID_CTRL) messages -- confirm.
 *
 * The values are implicit and count up from 0.  __CTRL_ATTR_MAX is a sentinel
 * one past the last attribute listed here, so the CTRL_ATTR_MAX macro that
 * follows the enum evaluates to 6.  Later in this header, when
 * CTRL_ATTR_MCAST_GRP_MAX is not already defined, CTRL_ATTR_MAX is redefined
 * to 7 and CTRL_ATTR_MCAST_GROUPS is added with value 7. */
enum {
|
|
|
|
CTRL_ATTR_UNSPEC,       /* 0 */
|
|
|
|
CTRL_ATTR_FAMILY_ID,    /* 1 */
|
|
|
|
CTRL_ATTR_FAMILY_NAME,  /* 2 */
|
|
|
|
CTRL_ATTR_VERSION,      /* 3 */
|
|
|
|
CTRL_ATTR_HDRSIZE,      /* 4 */
|
|
|
|
CTRL_ATTR_MAXATTR,      /* 5 */
|
|
|
|
CTRL_ATTR_OPS,          /* 6 */
|
|
|
|
__CTRL_ATTR_MAX,        /* 7: sentinel, not an attribute type. */
|
|
|
|
};
|
|
|
|
|
|
|
|
#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1)
|
|
|
|
|
|
|
|
/* CTRL_ATTR_OP_* attribute types.
 *
 * NOTE(review): presumably these describe a single operation nested inside a
 * CTRL_ATTR_OPS attribute -- confirm.
 *
 * The values are implicit and count up from 0.  __CTRL_ATTR_OP_MAX is a
 * sentinel one past the last attribute; the CTRL_ATTR_OP_MAX macro that
 * follows the enum is derived from it. */
enum {
|
|
|
|
CTRL_ATTR_OP_UNSPEC,    /* 0 */
|
|
|
|
CTRL_ATTR_OP_ID,        /* 1 */
|
|
|
|
CTRL_ATTR_OP_FLAGS,     /* 2 */
|
|
|
|
__CTRL_ATTR_OP_MAX,     /* 3: sentinel, not an attribute type. */
|
|
|
|
};
|
|
|
|
|
|
|
|
#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1)
|
2010-12-07 09:33:27 -08:00
|
|
|
#endif /* !HAVE_NETLINK */
|
2009-07-08 13:19:16 -07:00
|
|
|
|
2010-12-07 09:37:59 -08:00
|
|
|
/* These were introduced all together in 2.6.24. */
|
|
|
|
#ifndef NLA_TYPE_MASK
|
2012-03-23 11:43:54 -07:00
|
|
|
#define NLA_F_NESTED (1 << 15)
|
|
|
|
#define NLA_F_NET_BYTEORDER (1 << 14)
|
|
|
|
/* Mask that clears the NLA_F_NESTED and NLA_F_NET_BYTEORDER flag bits.  The
 * expansion is fully parenthesized so that it always acts as a single operand
 * wherever the macro is used. */
#define NLA_TYPE_MASK (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER))
|
2010-12-07 09:37:59 -08:00
|
|
|
#endif
|
|
|
|
|
route-table: Avoid routes from non-standard routing tables.
Currently, ovs-vswitchd is subscribed to all the routing changes in the
kernel. On each change, it marks the internal routing table cache as
invalid, then resets it and dumps all the routes from the kernel from
scratch. The reason for that is kernel routing updates not being
reliable in a sense that it's hard to tell which route is getting
removed or modified. Userspace application has to track the order in
which route entries are dumped from the kernel. Updates can get lost
or even duplicated and the kernel doesn't provide a good mechanism to
distinguish one route from another. To my knowledge, dumping all the
routes from a kernel after each change is the only way to keep the
cache consistent. Some more info can be found in the following never
addressed issues:
https://bugzilla.redhat.com/1337860
https://bugzilla.redhat.com/1337855
It seems to be believed that NetworkManager "mostly" does incremental
updates right. But it is still not completely correct, will re-dump
the whole table in certain cases, and it takes a huge amount of very
complicated code to do the accounting and route comparisons.
Going back to ovs-vswitchd, it currently dumps routes from all the
routing tables. If it will get conflicting routes from multiple
tables, the cache will not be useful. The routing cache in userspace
is primarily used for checking the egress port for tunneled traffic
and this way also detecting link state changes for a tunnel port.
For userspace datapath it is used for actual routing of the packet
after sending to a native tunnel.
With kernel datapath we don't really have a mechanism to know which
routing table will actually be used by the kernel after encapsulation,
so our lookups on a cache may be incorrect because of this as well.
So, unless all the relevant routes are in the standard tables, the
lookup in userspace route cache is unreliable.
Luckily, most setups are not using any complicated routing in
non-standard tables that OVS has to be aware of.
It is possible, but unlikely, that standard routing tables are
completely empty while some other custom table is not, and all the OVS
tunnel traffic is directed to that table. That would be the only
scenario where dumping non-standard tables would make sense. But it
seems like this kind of setup will likely need a way to tell OVS from
which table the routes should be taken, or we'll need to dump routing
rules and keep a separate cache for each table, so we can first match
on rules and then lookup correct routes in a specific table. I'm not
sure if trying to implement all that is justified.
For now, stop considering routes from non-standard tables to avoid
mixing different tables together and also wasting CPU resources.
This fixes a high CPU usage in ovs-vswitchd in case a BGP daemon is
running on a same host and in a same network namespace with OVS using
its own custom routing table.
Unfortunately, there seems to be no way to tell the kernel to send
updates only for particular tables. So, we'll still receive and parse
all of them. But they will not result in a full cache invalidation in
most cases.
Linux kernel v4.20 introduced filtering support for RTM_GETROUTE dumps.
So, we can make use of it and dump only standard tables when we get a
relevant route update. NETLINK_GET_STRICT_CHK has to be enabled on
the socket for filtering to work. There is no reason to not enable it
by default, if supported. It is not used outside of NETLINK_ROUTE.
Fixes: f0e167f0dbad ("route-table: Handle route updates more robustly.")
Fixes: ea83a2fcd0d3 ("lib: Show tunnel egress interface in ovsdb")
Reported-at: https://github.com/openvswitch/ovs-issues/issues/185
Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-October/052091.html
Acked-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-03-20 19:47:21 +01:00
|
|
|
/* Introduced in v4.4. */
|
|
|
|
#ifndef NLM_F_DUMP_FILTERED
|
|
|
|
#define NLM_F_DUMP_FILTERED 0x20
|
|
|
|
#endif
|
|
|
|
|
2011-01-09 16:57:45 -08:00
|
|
|
/* These were introduced all together in 2.6.14. (We want our programs to
|
|
|
|
* support the newer kernel features even if compiled with older headers.) */
|
|
|
|
#ifndef NETLINK_ADD_MEMBERSHIP
|
|
|
|
#define NETLINK_ADD_MEMBERSHIP 1
|
|
|
|
#define NETLINK_DROP_MEMBERSHIP 2
|
|
|
|
#endif
|
|
|
|
|
2018-03-29 23:05:29 -03:00
|
|
|
/* This was introduced in v4.2. (We want our programs to support the newer
|
|
|
|
* kernel features even if compiled with older headers.) */
|
|
|
|
#ifndef NETLINK_LISTEN_ALL_NSID
|
|
|
|
#define NETLINK_LISTEN_ALL_NSID 8
|
|
|
|
#endif
|
|
|
|
|
route-table: Avoid routes from non-standard routing tables.
Currently, ovs-vswitchd is subscribed to all the routing changes in the
kernel. On each change, it marks the internal routing table cache as
invalid, then resets it and dumps all the routes from the kernel from
scratch. The reason for that is kernel routing updates not being
reliable in a sense that it's hard to tell which route is getting
removed or modified. Userspace application has to track the order in
which route entries are dumped from the kernel. Updates can get lost
or even duplicated and the kernel doesn't provide a good mechanism to
distinguish one route from another. To my knowledge, dumping all the
routes from a kernel after each change is the only way to keep the
cache consistent. Some more info can be found in the following never
addressed issues:
https://bugzilla.redhat.com/1337860
https://bugzilla.redhat.com/1337855
It seems to be believed that NetworkManager "mostly" does incremental
updates right. But it is still not completely correct, will re-dump
the whole table in certain cases, and it takes a huge amount of very
complicated code to do the accounting and route comparisons.
Going back to ovs-vswitchd, it currently dumps routes from all the
routing tables. If it will get conflicting routes from multiple
tables, the cache will not be useful. The routing cache in userspace
is primarily used for checking the egress port for tunneled traffic
and this way also detecting link state changes for a tunnel port.
For userspace datapath it is used for actual routing of the packet
after sending to a native tunnel.
With kernel datapath we don't really have a mechanism to know which
routing table will actually be used by the kernel after encapsulation,
so our lookups on a cache may be incorrect because of this as well.
So, unless all the relevant routes are in the standard tables, the
lookup in userspace route cache is unreliable.
Luckily, most setups are not using any complicated routing in
non-standard tables that OVS has to be aware of.
It is possible, but unlikely, that standard routing tables are
completely empty while some other custom table is not, and all the OVS
tunnel traffic is directed to that table. That would be the only
scenario where dumping non-standard tables would make sense. But it
seems like this kind of setup will likely need a way to tell OVS from
which table the routes should be taken, or we'll need to dump routing
rules and keep a separate cache for each table, so we can first match
on rules and then lookup correct routes in a specific table. I'm not
sure if trying to implement all that is justified.
For now, stop considering routes from non-standard tables to avoid
mixing different tables together and also wasting CPU resources.
This fixes a high CPU usage in ovs-vswitchd in case a BGP daemon is
running on a same host and in a same network namespace with OVS using
its own custom routing table.
Unfortunately, there seems to be no way to tell the kernel to send
updates only for particular tables. So, we'll still receive and parse
all of them. But they will not result in a full cache invalidation in
most cases.
Linux kernel v4.20 introduced filtering support for RTM_GETROUTE dumps.
So, we can make use of it and dump only standard tables when we get a
relevant route update. NETLINK_GET_STRICT_CHK has to be enabled on
the socket for filtering to work. There is no reason to not enable it
by default, if supported. It is not used outside of NETLINK_ROUTE.
Fixes: f0e167f0dbad ("route-table: Handle route updates more robustly.")
Fixes: ea83a2fcd0d3 ("lib: Show tunnel egress interface in ovsdb")
Reported-at: https://github.com/openvswitch/ovs-issues/issues/185
Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-October/052091.html
Acked-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-03-20 19:47:21 +01:00
|
|
|
/* Strict checking of netlink arguments introduced in Linux kernel v4.20. */
|
|
|
|
#ifndef NETLINK_GET_STRICT_CHK
|
|
|
|
#define NETLINK_GET_STRICT_CHK 12
|
|
|
|
#endif
|
|
|
|
|
2011-09-06 09:33:26 -07:00
|
|
|
/* These were introduced all together in 2.6.23. (We want our programs to
|
|
|
|
* support the newer kernel features even if compiled with older headers.) */
|
|
|
|
#ifndef CTRL_ATTR_MCAST_GRP_MAX
|
|
|
|
|
|
|
|
#undef CTRL_ATTR_MAX
|
|
|
|
#define CTRL_ATTR_MAX 7
|
|
|
|
#define CTRL_ATTR_MCAST_GROUPS 7
|
|
|
|
|
|
|
|
/* CTRL_ATTR_MCAST_GRP_* attribute types.
 *
 * Only defined here when the system headers did not already provide
 * CTRL_ATTR_MCAST_GRP_MAX (see the enclosing #ifndef).
 *
 * NOTE(review): presumably these describe one multicast group nested inside
 * a CTRL_ATTR_MCAST_GROUPS attribute -- confirm.
 *
 * The values are implicit and count up from 0.  __CTRL_ATTR_MCAST_GRP_MAX is
 * a sentinel one past the last attribute; the CTRL_ATTR_MCAST_GRP_MAX macro
 * that follows the enum is derived from it. */
enum {
|
2012-03-23 11:43:54 -07:00
|
|
|
CTRL_ATTR_MCAST_GRP_UNSPEC,     /* 0 */
|
|
|
|
CTRL_ATTR_MCAST_GRP_NAME,       /* 1 */
|
|
|
|
CTRL_ATTR_MCAST_GRP_ID,         /* 2 */
|
|
|
|
__CTRL_ATTR_MCAST_GRP_MAX,      /* 3: sentinel, not an attribute type. */
|
2011-09-06 09:33:26 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
|
|
|
|
#endif /* CTRL_ATTR_MCAST_GRP_MAX */
|
|
|
|
|
2021-08-11 17:43:49 +02:00
|
|
|
#ifndef NETLINK_EXT_ACK
|
|
|
|
|
|
|
|
#define NETLINK_CAP_ACK 10
|
|
|
|
#define NETLINK_EXT_ACK 11
|
|
|
|
|
|
|
|
/* ACK message flags. */
|
|
|
|
#define NLM_F_CAPPED 0x100
|
|
|
|
#define NLM_F_ACK_TLVS 0x200
|
|
|
|
|
|
|
|
/* NLMSGERR_ATTR_* attribute types.
 *
 * Only defined here when the system headers did not already provide
 * NETLINK_EXT_ACK (see the enclosing #ifndef).
 *
 * NOTE(review): presumably these are the attributes appended to an ACK or
 * error message when NLM_F_ACK_TLVS is set -- confirm against the Linux
 * netlink documentation.
 *
 * The values are implicit and count up from 0.  __NLMSGERR_ATTR_MAX is a
 * sentinel one past the last attribute, and NLMSGERR_ATTR_MAX is defined
 * inside the enum as the highest real attribute type (2). */
enum {
|
|
|
|
NLMSGERR_ATTR_UNUSED,   /* 0 */
|
|
|
|
NLMSGERR_ATTR_MSG,      /* 1 */
|
|
|
|
NLMSGERR_ATTR_OFFS,     /* 2 */
|
|
|
|
__NLMSGERR_ATTR_MAX,    /* 3: sentinel, not an attribute type. */
|
|
|
|
NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* NETLINK_EXT_ACK */
|
|
|
|
|
2009-07-08 13:19:16 -07:00
|
|
|
#endif /* netlink-protocol.h */
|