2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-24 02:47:14 +00:00
ovs/lib/netlink-socket.h

135 lines
4.9 KiB
C
Raw Normal View History

/*
* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef NETLINK_SOCKET_H
#define NETLINK_SOCKET_H 1
/* Netlink socket definitions.
*
* Netlink is a datagram-based network protocol primarily for communication
* between user processes and the kernel, and mainly on Linux. Netlink is
* specified in RFC 3549, "Linux Netlink as an IP Services Protocol".
*
* Netlink is not suitable for use in physical networks of heterogeneous
* machines because host byte order is used throughout.
*
* This header file defines functions for working with Netlink sockets, which
* are Linux-specific. For Netlink protocol definitions, see
* netlink-protocol.h. For helper functions for working with Netlink messages,
* see netlink.h.
*
*
* Thread-safety
* =============
*
* Most of the netlink functions are not fully thread-safe: Only a single
* thread may use a given nl_sock or nl_dump at one time. The exceptions are:
*
* - nl_sock_recv() is conditionally thread-safe: it may be called from
* different threads with the same nl_sock, but each caller must provide
* an independent receive buffer.
*
* - nl_dump_next() is conditionally thread-safe: it may be called from
* different threads with the same nl_dump, but each caller must provide
* independent buffers.
*/
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "ofpbuf.h"
#include "ovs-atomic.h"
netlink-socket: Work around kernel Netlink dump thread races. The Linux kernel Netlink implementation has two races that cause problems for processes that attempt to dump a table in a multithreaded manner. The first race is in the structure of the kernel netlink_recv() function. This function pulls a message from the socket queue and, if there is none, reports EAGAIN: skb = skb_recv_datagram(sk, flags, noblock, &err); if (skb == NULL) goto out; Only if a message is successfully read from the socket queue does the function, toward the end, try to queue up a new message to be dumped: if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { sk->sk_err = ret; sk->sk_error_report(sk); } } This means that if thread A reads a message from a dump, then thread B attempts to read one before A queues up the next, B will get EAGAIN. This means that, following EAGAIN, B needs to wait until A returns to userspace before it tries to read the socket again. nl_dump_next() already does this, using 'dump->status_seq' (although the need for it has never been explained clearly, to my knowledge). The second race is more serious. Suppose thread X and thread Y both simultaneously attempt to queue up a new message to be dumped, using the call to netlink_dump() quoted above. netlink_dump() begins with: mutex_lock(nlk->cb_mutex); cb = nlk->cb; if (cb == NULL) { err = -EINVAL; goto errout_skb; } Suppose that X gets cb_mutex first and finds that the dump is complete. It will therefore, toward the end of netlink_dump(), clear nlk->cb to NULL to indicate that no dump is in progress and release the mutex: nlk->cb = NULL; mutex_unlock(nlk->cb_mutex); When Y grabs cb_mutex afterward, it will see that nlk->cb is NULL and return -EINVAL as quoted above. netlink_recv() stuffs -EINVAL in sk_err, but that error is not reported immediately; instead, it is saved for the next read from the socket. 
Since Open vSwitch maintains a pool of Netlink sockets, that next failure can crop up pretty much anywhere. One of the worst places for it to crop up is in the execution of a later transaction (e.g. in nl_sock_transact_multiple__()), because userspace treats Netlink transactions as idempotent and will re-execute them when socket errors occur. For a transaction that sends a packet, this causes packet duplication, which we actually observed in practice. (ENOBUFS should actually cause transactions to be re-executed in many cases, but EINVAL should not; this is a separate bug in the userspace netlink code.) VMware-BZ: #1283188 Reported-and-tested-by: Alex Wang <alexw@nicira.com> Signed-off-by: Ben Pfaff <blp@nicira.com> Acked-by: Alex Wang <alexw@nicira.com>
2014-07-10 16:48:16 -07:00
#include "ovs-thread.h"
/* Netlink socket handle.  This header only forward-declares the type, so
 * code that includes it can hold and pass 'struct nl_sock *' but cannot
 * access any members. */
struct nl_sock;
/* Stop the build with an explicit message when HAVE_NETLINK is not defined,
 * since everything declared below is for Netlink sockets only. */
#ifndef HAVE_NETLINK
#error "netlink-socket.h is only for hosts that support Netlink sockets"
#endif
/* Netlink sockets. */
/* Going by the name and signature, creates a Netlink socket for 'protocol'
 * and hands it back through the 'struct nl_sock **' out-parameter.
 *
 * NOTE(review): only the prototype is visible here.  The int result is
 * presumably 0 on success or a positive errno value (the convention that
 * 'struct nl_transaction' documents for its 'error' member) -- confirm
 * against the implementation. */
int nl_sock_create(int protocol, struct nl_sock **);
netlink-socket: Make dumping and doing transactions on same nl_sock safe. It's not safe to use a single Netlink fd to do multiple operations in an asynchronous way. Some of the limitations are fundamental; for example, the kernel only supports a single "dump" operation at a time. Others are limitations imposed by the OVS coding style; for example, our Netlink library is not callback based, so nothing can be done about incoming messages that can't be handled immediately. Regardless, in OVS, multicast groups, transactions, and dumps cannot coexist on a single nl_sock. This is only mildly irritating at the moment, but it will become much worse later on, when dpif-linux shifts to using Netlink dumps for listing various kinds of datapath entities. When that happens, a dump will be in progress in situations where the dpif-linux client might want to do other operations. For example, it is reasonable for the client to list flows and, in the middle, look up information on vports mentioned in those flows. It might be possible to simply ban and avoid such nested operations--I have not even audited the source tree to find out whether we do anything like that already--but that seems like an unnecessary cramp on our coding style. Furthermore, it's difficult to explain and justify without understanding the implementation. This patch takes another approach, by improving the Netlink socket library to avoid artificial constraints. When an operation, or a dump, or joining a multicast group would cause a problem, this patch makes the library transparently create a separate Netlink socket. This solves the problem without putting any onerous restrictions on use. This commit also slightly simplifies netdev_vport_reset_names(). It had been written to destroy the dump object before the Netlink socket that it used, but this is no longer necessary and doing it in the opposite order saved a few lines of code. Reviewed by Ethan Jackson <ethan@nicira.com>.
2011-01-22 15:23:10 -08:00
/* NOTE(review): this header carries only prototypes, so the one-line
 * summaries below are inferred from names and signatures.  Confirm the
 * details against the implementation, in particular the meaning of each int
 * result (presumably 0 on success, otherwise a positive errno value). */

/* Presumably creates a new socket modeled on the given (const) socket and
 * returns it through the 'struct nl_sock **' out-parameter. */
int nl_sock_clone(const struct nl_sock *, struct nl_sock **);
/* Counterpart of the create/clone functions: disposes of a socket. */
void nl_sock_destroy(struct nl_sock *);
/* Subscribe the socket to, or unsubscribe it from, the multicast group
 * identified by 'multicast_group'. */
int nl_sock_join_mcgroup(struct nl_sock *, unsigned int multicast_group);
int nl_sock_leave_mcgroup(struct nl_sock *, unsigned int multicast_group);
/* Send the message held in the (const) ofpbuf on the socket.
 * NOTE(review): 'wait' presumably selects blocking vs. non-blocking
 * behavior -- confirm. */
int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
/* Same as nl_sock_send() but with an explicit 'nlmsg_seq' argument,
 * presumably the sequence number to place in the outgoing message. */
int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
uint32_t nlmsg_seq, bool wait);
/* Receive a message from the socket into the caller's ofpbuf.
 *
 * Per the thread-safety notes at the top of this file, this function may be
 * called from different threads with the same nl_sock, provided each caller
 * supplies an independent receive buffer. */
int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait);
/* Carries out one request/reply exchange: takes 'request' and yields the
 * reply through 'replyp'.
 * NOTE(review): ownership of '*replyp' and whether 'replyp' may be NULL are
 * not visible from this header. */
int nl_sock_transact(struct nl_sock *, const struct ofpbuf *request,
struct ofpbuf **replyp);
/* Presumably discards whatever is queued for receipt on the socket. */
int nl_sock_drain(struct nl_sock *);
/* NOTE(review): 'events' looks like a poll()-style event mask that the
 * caller wants to be woken up for -- confirm. */
void nl_sock_wait(const struct nl_sock *, short int events);
/* Accessors on a const socket: by their names, the underlying file
 * descriptor and the socket's Netlink pid, respectively. */
int nl_sock_fd(const struct nl_sock *);
uint32_t nl_sock_pid(const struct nl_sock *);
/* Batching transactions. */
/* One request/reply pair in a batch.  Arrays of pointers to these are what
 * nl_sock_transact_multiple() and nl_transact_multiple() accept.
 *
 * The client supplies 'request' and sets up 'reply' as described below.
 * NOTE(review): 'error' (and the contents of 'reply') are presumably written
 * by the transact call -- confirm against the implementation. */
struct nl_transaction {
/* Filled in by client. */
struct ofpbuf *request; /* Request to send. */
/* The client must initialize 'reply' to one of:
*
* - NULL, if it does not care to examine the reply.
*
* - Otherwise, to an ofpbuf with a memory allocation of at least
* NLMSG_HDRLEN bytes.
*/
struct ofpbuf *reply; /* Reply (empty if reply was an error code). */
int error; /* Positive errno value, 0 if no error. */
};
/* Batch form of a transaction on the given socket: takes an array of 'n'
 * pointers to 'struct nl_transaction'.
 *
 * The function itself returns void; each transaction has its own 'error'
 * member (positive errno value, 0 if no error), which is presumably where
 * per-transaction results are reported -- confirm against the
 * implementation. */
void nl_sock_transact_multiple(struct nl_sock *,
struct nl_transaction **, size_t n);
/* Transactions without an allocated socket. */
/* Socket-less variants of the transaction functions: the caller names only
 * the Netlink 'protocol' instead of passing a 'struct nl_sock *'.
 *
 * NOTE(review): how the socket used for the exchange is obtained is not
 * visible from this header.  The int result of nl_transact() is presumably 0
 * or a positive errno value; nl_transact_multiple() returns void, and
 * presumably reports per-transaction results in each 'struct
 * nl_transaction' -- confirm. */
int nl_transact(int protocol, const struct ofpbuf *request,
struct ofpbuf **replyp);
void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);
/* Table dumping. */
/* NOTE(review): presumably the size, in bytes, that callers should give the
 * 'buf' they pass to nl_dump_next() -- nothing in this header uses the
 * constant, so confirm against the callers. */
#define NL_DUMP_BUFSIZE 4096
/* State of one in-progress table dump.
 *
 * 'sock' and 'nl_seq' do not change once the dump exists.  'status' is the
 * only mutable member and is guarded by 'mutex', which also serializes use
 * of 'sock'; this is what lets nl_dump_next() be called from several threads
 * on the same nl_dump (see the thread-safety notes at the top of this
 * file). */
struct nl_dump {
/* These members are immutable during the lifetime of the nl_dump. */
struct nl_sock *sock; /* Socket being dumped. */
uint32_t nl_seq; /* Expected nlmsg_seq for replies. */
/* 'mutex' protects 'status' and serializes access to 'sock'. */
struct ovs_mutex mutex; /* Protects 'status', synchronizes recv(). */
int status OVS_GUARDED; /* 0: dump in progress,
* positive errno: dump completed with error,
* EOF: dump completed successfully. */
};
/* Begins a dump on 'protocol' using 'request', with the caller-provided
 * 'struct nl_dump' holding the dump's state.  Returns void.
 * NOTE(review): a failure to start is presumably recorded in the dump's
 * 'status' and surfaced later -- confirm. */
void nl_dump_start(struct nl_dump *, int protocol,
const struct ofpbuf *request);
/* Fetches the next item of the dump, using 'reply' and 'buf' as
 * caller-provided buffers.
 *
 * Per the thread-safety notes at the top of this file, this may be called
 * from different threads with the same nl_dump, provided each caller
 * supplies independent buffers.
 *
 * NOTE(review): the bool result presumably is true when an item was
 * retrieved and false once the dump has finished or failed -- confirm. */
bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
/* Finishes the dump.
 * NOTE(review): the int result is presumably 0 if the dump completed
 * successfully, otherwise the positive errno value recorded in the dump's
 * 'status' -- confirm. */
int nl_dump_done(struct nl_dump *);
/* Miscellaneous */
/* Generic Netlink lookups by name.
 *
 * nl_lookup_genl_family() takes a family 'name' and has an int
 * out-parameter, 'number'.  nl_lookup_genl_mcgroup() takes 'family_name' and
 * 'group_name' and has an unsigned int out-parameter, 'multicast_group',
 * which has the same type that nl_sock_join_mcgroup() and
 * nl_sock_leave_mcgroup() accept.
 *
 * NOTE(review): the int results are presumably 0 on success or a positive
 * errno value, and the out-parameters are presumably written only on
 * success -- confirm against the implementation. */
int nl_lookup_genl_family(const char *name, int *number);
int nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
unsigned int *multicast_group);
#endif /* netlink-socket.h */