ovs/lib/netlink-socket.c

/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "netlink-socket.h"
#include <errno.h>
#include <inttypes.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
#include "coverage.h"
#include "dynamic-string.h"
#include "hash.h"
#include "hmap.h"
#include "netlink.h"
#include "netlink-protocol.h"
#include "odp-netlink.h"
#include "ofpbuf.h"
#include "ovs-thread.h"
#include "poll-loop.h"
#include "seq.h"
#include "socket-util.h"
#include "util.h"
#include "vlog.h"

/* Logging module used by the VLOG_* calls in this file. */
VLOG_DEFINE_THIS_MODULE(netlink_socket);

/* Coverage counters bumped by the send and receive paths in this file:
 *
 *   netlink_overflow:   a receive failed with ENOBUFS.
 *   netlink_received:   a message was received successfully.
 *   netlink_recv_jumbo: a received message was larger than the caller's
 *                       buffer allocation and spilled into the overflow area.
 *   netlink_sent:       a message was sent successfully. */
COVERAGE_DEFINE(netlink_overflow);
COVERAGE_DEFINE(netlink_received);
COVERAGE_DEFINE(netlink_recv_jumbo);
COVERAGE_DEFINE(netlink_sent);

/* Linux header file confusion causes this to be undefined. */
#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif

#ifdef _WIN32
static struct ovs_mutex portid_mutex = OVS_MUTEX_INITIALIZER;
static uint32_t g_last_portid = 0;

/* Hands out the next port ID by advancing the file-level 'g_last_portid'
 * counter and returning its new value, so that successive calls yield
 * distinct IDs (until the 32-bit counter wraps around). */
static uint32_t
portid_next(void)
    OVS_GUARDED_BY(portid_mutex)
{
    return ++g_last_portid;
}
#endif /* _WIN32 */

/* A single (bad) Netlink message can in theory dump out many, many log
 * messages, so the burst size is set quite high here to avoid missing useful
 * information.  Also, at high logging levels we log *all* Netlink messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);

/* Forward declarations of static helpers called by the code below. */
static uint32_t nl_sock_allocate_seq(struct nl_sock *, unsigned int n);
static void log_nlmsg(const char *function, int error,
                      const void *message, size_t size, int protocol);
#ifdef _WIN32
/* Windows only: queries the datapath for the pid to store in 'sock'. */
static int get_sock_pid_from_kernel(struct nl_sock *sock);
#endif

/* Netlink sockets. */

/* State kept for one Netlink socket.  On Windows the "socket" is a handle to
 * the Open vSwitch device rather than a file descriptor. */
struct nl_sock {
#ifdef _WIN32
    HANDLE handle;              /* Device handle opened by nl_sock_create(). */
#else
    int fd;                     /* AF_NETLINK socket file descriptor. */
#endif
    uint32_t next_seq;          /* Set to 1 by nl_sock_create(). */
    uint32_t pid;               /* Copied into nlmsg_pid of sent messages. */
    int protocol;               /* Protocol passed to nl_sock_create(). */
    unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
};

/* Compile-time limit on iovecs, so that we can allocate a maximum-size array
 * of iovecs on the stack. */
#define MAX_IOVS 128

/* Maximum number of iovecs that may be passed to sendmsg, capped at a
 * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
 *
 * Initialized by nl_sock_create(). */
static int max_iovs;

/* Socket pool helpers.  nl_dump_start() obtains its socket with
 * nl_pool_alloc() and nl_dump_done() hands it back with nl_pool_release(). */
static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
static void nl_pool_release(struct nl_sock *);

/* Creates a new netlink socket for the given netlink 'protocol'
 * (NETLINK_ROUTE, NETLINK_GENERIC, ...).  Returns 0 and sets '*sockp' to the
 * new socket if successful, otherwise returns a positive errno value and
 * leaves '*sockp' set to NULL.
 *
 * The first call also initializes 'max_iovs' from sysconf(_SC_UIO_MAXIOV),
 * clamped to the range [_XOPEN_IOV_MAX, MAX_IOVS].
 *
 * On POSIX systems the socket is connected to the kernel (pid 0), its receive
 * buffer is raised to 1 MB where permitted, and the pid assigned by the kernel
 * is recorded.  On Windows the Open vSwitch device is opened instead and the
 * pid is obtained with get_sock_pid_from_kernel(). */
int
nl_sock_create(int protocol, struct nl_sock **sockp)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    struct nl_sock *sock;
#ifndef _WIN32
    struct sockaddr_nl local, remote;
    socklen_t local_size;
#endif
    int rcvbuf;
    int retval = 0;

    if (ovsthread_once_start(&once)) {
        /* sysconf() reports "no limit" and "error" both as -1, so errno has
         * to be cleared beforehand to tell them apart.  The caller's errno is
         * restored afterward. */
        int save_errno = errno;
        errno = 0;

        max_iovs = sysconf(_SC_UIO_MAXIOV);
        if (max_iovs < _XOPEN_IOV_MAX) {
            if (max_iovs == -1 && errno) {
                VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", ovs_strerror(errno));
            }
            max_iovs = _XOPEN_IOV_MAX;
        } else if (max_iovs > MAX_IOVS) {
            max_iovs = MAX_IOVS;
        }

        errno = save_errno;
        ovsthread_once_done(&once);
    }

    *sockp = NULL;
    sock = xmalloc(sizeof *sock);

#ifdef _WIN32
    sock->handle = CreateFileA("\\\\.\\OpenVSwitchDevice",
                               GENERIC_READ | GENERIC_WRITE,
                               FILE_SHARE_READ | FILE_SHARE_WRITE,
                               NULL, OPEN_EXISTING,
                               FILE_ATTRIBUTE_NORMAL, NULL);

    int last_error = GetLastError();

    if (sock->handle == INVALID_HANDLE_VALUE) {
        /* NOTE(review): 'last_error' is a Windows error code, not an errno
         * value, so the text produced here may not match it -- confirm. */
        VLOG_ERR("CreateFile: %s", ovs_strerror(last_error));
        goto error;
    }
#else
    sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
    if (sock->fd < 0) {
        /* Capture errno before logging so that the value returned is the one
         * that socket() reported. */
        retval = errno;
        VLOG_ERR("socket: %s", ovs_strerror(retval));
        goto error;
    }
#endif

    sock->protocol = protocol;
    sock->next_seq = 1;

    rcvbuf = 1024 * 1024;
#ifdef _WIN32
    sock->rcvbuf = rcvbuf;
    retval = get_sock_pid_from_kernel(sock);
    if (retval != 0) {
        goto error;
    }
#else
    if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUFFORCE,
                   &rcvbuf, sizeof rcvbuf)) {
        /* Only root can use SO_RCVBUFFORCE.  Everyone else gets EPERM.
         * Warn only if the failure is therefore unexpected. */
        if (errno != EPERM) {
            VLOG_WARN_RL(&rl, "setting %d-byte socket receive buffer failed "
                         "(%s)", rcvbuf, ovs_strerror(errno));
        }
    }

    /* Record whatever receive buffer size the socket actually ended up with.
     * get_socket_rcvbuf() reports failure as a negative errno value. */
    retval = get_socket_rcvbuf(sock->fd);
    if (retval < 0) {
        retval = -retval;
        goto error;
    }
    sock->rcvbuf = retval;
    retval = 0;

    /* Connect to kernel (pid 0) as remote address. */
    memset(&remote, 0, sizeof remote);
    remote.nl_family = AF_NETLINK;
    remote.nl_pid = 0;
    if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) {
        retval = errno;
        VLOG_ERR("connect(0): %s", ovs_strerror(retval));
        goto error;
    }

    /* Obtain pid assigned by kernel. */
    local_size = sizeof local;
    if (getsockname(sock->fd, (struct sockaddr *) &local, &local_size) < 0) {
        retval = errno;
        VLOG_ERR("getsockname: %s", ovs_strerror(retval));
        goto error;
    }
    if (local_size < sizeof local || local.nl_family != AF_NETLINK) {
        VLOG_ERR("getsockname returned bad Netlink name");
        retval = EINVAL;
        goto error;
    }
    sock->pid = local.nl_pid;
#endif

    *sockp = sock;
    return 0;

error:
    /* Guarantee a nonzero return value: fall back on errno, and on EINVAL if
     * even that is unset. */
    if (retval == 0) {
        retval = errno;
        if (retval == 0) {
            retval = EINVAL;
        }
    }
#ifdef _WIN32
    if (sock->handle != INVALID_HANDLE_VALUE) {
        CloseHandle(sock->handle);
    }
#else
    if (sock->fd >= 0) {
        close(sock->fd);
    }
#endif
    free(sock);
    return retval;
}

/* Opens a fresh netlink socket that speaks the same protocol as 'src'.  On
 * success, stores the new socket in '*sockp' and returns 0; on failure,
 * returns a positive errno value.  No other state of 'src' is copied. */
int
nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp)
{
    const int protocol = src->protocol;

    return nl_sock_create(protocol, sockp);
}

/* Closes the OS resource behind 'sock' and frees 'sock' itself.  Passing a
 * null 'sock' is allowed and does nothing. */
void
nl_sock_destroy(struct nl_sock *sock)
{
    if (!sock) {
        return;
    }

#ifdef _WIN32
    CloseHandle(sock->handle);
#else
    close(sock->fd);
#endif
    free(sock);
}

#ifdef _WIN32
/* Reads the pid for 'sock' generated in the kernel datapath. The function
 * follows a transaction semantic. Eventually this function should call into
 * nl_transact.
 *
 * Builds an OVS_CTRL_CMD_WIN_GET_PID request, issues it with a single
 * OVS_IOCTL_TRANSACT DeviceIoControl() call, and on success stores the
 * nlmsg_pid of the reply in 'sock->pid'.
 *
 * Returns 0 on success.  Returns EINVAL if the ioctl fails, if the reply is
 * shorter than the expected headers, or if the reply's sequence number does
 * not match the request's. */
static int
get_sock_pid_from_kernel(struct nl_sock *sock)
{
    struct nl_transaction txn;
    struct ofpbuf request;
    uint64_t request_stub[128];
    struct ofpbuf reply;
    uint64_t reply_stub[128];
    struct ovs_header *ovs_header;
    struct nlmsghdr *nlmsg;
    uint32_t seq;
    int retval;
    DWORD bytes;
    /* Smallest acceptable reply: the three headers with no attributes. */
    int ovs_msg_size = sizeof (struct nlmsghdr) + sizeof (struct genlmsghdr) +
                       sizeof (struct ovs_header);

    /* Both buffers start out backed by the on-stack stubs. */
    ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
    txn.request = &request;
    ofpbuf_use_stub(&reply, reply_stub, sizeof reply_stub);
    txn.reply = &reply;

    /* Compose the request and stamp it with a fresh sequence number. */
    seq = nl_sock_allocate_seq(sock, 1);
    nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
                          OVS_CTRL_CMD_WIN_GET_PID, OVS_WIN_CONTROL_VERSION);
    nlmsg = nl_msg_nlmsghdr(txn.request);
    nlmsg->nlmsg_seq = seq;

    ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
    ovs_header->dp_ifindex = 0;
    /* Reserve 'ovs_msg_size' bytes in the reply so that the ioctl has an
     * output area of that size.  The pointer stored in 'ovs_header' here is
     * not used afterward. */
    ovs_header = ofpbuf_put_uninit(&reply, ovs_msg_size);

    if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
                         ofpbuf_data(txn.request), ofpbuf_size(txn.request),
                         ofpbuf_data(txn.reply), ofpbuf_size(txn.reply),
                         &bytes, NULL)) {
        retval = EINVAL;
        goto done;
    } else {
        /* A reply shorter than the headers cannot be parsed. */
        if (bytes < ovs_msg_size) {
            retval = EINVAL;
            goto done;
        }

        /* The reply must answer the request that was just sent. */
        nlmsg = nl_msg_nlmsghdr(txn.reply);
        if (nlmsg->nlmsg_seq != seq) {
            retval = EINVAL;
            goto done;
        }
        sock->pid = nlmsg->nlmsg_pid;
    }
    retval = 0;

done:
    ofpbuf_uninit(&request);
    ofpbuf_uninit(&reply);
    return retval;
}
#endif  /* _WIN32 */

/* Tries to add 'sock' as a listener for 'multicast_group'.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * A socket that is subscribed to a multicast group that receives asynchronous
 * notifications must not be used for Netlink transactions or dumps, because
 * transactions and dumps can cause notifications to be lost.
 *
 * Multicast group numbers are always positive.
 *
 * It is not an error to attempt to join a multicast group to which a socket
 * already belongs.
 *
 * On Windows the join request is sent to the datapath as a message and the
 * result of that send is not checked, so this function always returns 0
 * there. */
int
nl_sock_join_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
{
#ifdef _WIN32
#define OVS_VPORT_MCGROUP_FALLBACK_ID 33
    struct ofpbuf msg_buf;
    /* Join/leave request: a Netlink header followed by the operation and the
     * group it applies to. */
    struct message_multicast
    {
        struct nlmsghdr;
        /* Nonzero to join the group, zero to leave it. */
        unsigned char join;
        unsigned int groupId;
    };

    struct message_multicast msg = { 0 };

    msg.nlmsg_len = sizeof(struct message_multicast);
    msg.nlmsg_type = OVS_VPORT_MCGROUP_FALLBACK_ID;
    msg.nlmsg_flags = 0;
    msg.nlmsg_seq = 0;
    msg.nlmsg_pid = sock->pid;

    msg.join = 1;
    msg.groupId = multicast_group;
    /* Point an ofpbuf at the on-stack message so that it can be handed to
     * nl_sock_send__().  Only these three fields of 'msg_buf' are set. */
    msg_buf.base_ = &msg;
    msg_buf.data_ = &msg;
    msg_buf.size_ = msg.nlmsg_len;

    /* NOTE(review): nl_sock_send__() is defined further down and no earlier
     * declaration is visible in this part of the file -- confirm that the
     * Windows build sees a prototype.  Its return value is discarded. */
    nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
#else
    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
                   &multicast_group, sizeof multicast_group) < 0) {
        VLOG_WARN("could not join multicast group %u (%s)",
                  multicast_group, ovs_strerror(errno));
        return errno;
    }
#endif
    return 0;
}

/* Tries to make 'sock' stop listening to 'multicast_group'.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Multicast group numbers are always positive.
 *
 * It is not an error to attempt to leave a multicast group to which a socket
 * does not belong.
 *
 * On success, reading from 'sock' will still return any messages that were
 * received on 'multicast_group' before the group was left.
 *
 * On Windows the leave request is sent to the datapath as a message and the
 * result of that send is not checked, so this function always returns 0
 * there. */
int
nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
{
#ifdef _WIN32
    struct ofpbuf msg_buf;
    /* Leave request: a Netlink header followed by the operation.  Unlike the
     * structure used by nl_sock_join_mcgroup(), this one carries no separate
     * group ID field. */
    struct message_multicast
    {
        struct nlmsghdr;
        /* Nonzero to join the group, zero to leave it. */
        unsigned char join;
    };

    struct message_multicast msg = { 0 };
    /* NOTE(review): '&msg' is a 'struct message_multicast *', whereas
     * nl_sock_join_mcgroup() fills in the header fields by hand and other
     * nl_msg_put_*() calls in this file are given an ofpbuf.  This call
     * presumably needs the same treatment -- confirm against
     * nl_msg_put_nlmsghdr()'s prototype and the datapath's expected format. */
    nl_msg_put_nlmsghdr(&msg, sizeof(struct message_multicast),
                        multicast_group, 0);
    msg.join = 0;

    /* Point an ofpbuf at the on-stack message so that it can be handed to
     * nl_sock_send__().  Only these three fields of 'msg_buf' are set. */
    msg_buf.base_ = &msg;
    msg_buf.data_ = &msg;
    msg_buf.size_ = msg.nlmsg_len;

    /* The return value of the send is discarded. */
    nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
#else
    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_DROP_MEMBERSHIP,
                   &multicast_group, sizeof multicast_group) < 0) {
        VLOG_WARN("could not leave multicast group %u (%s)",
                  multicast_group, ovs_strerror(errno));
        return errno;
    }
#endif
    return 0;
}

static int
nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
               uint32_t nlmsg_seq, bool wait)
{
    struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(msg);
    int error;

    nlmsg->nlmsg_len = ofpbuf_size(msg);
    nlmsg->nlmsg_seq = nlmsg_seq;
    nlmsg->nlmsg_pid = sock->pid;
    do {
        int retval;
#ifdef _WIN32
        bool result;
        DWORD last_error = 0;
        result = WriteFile(sock->handle, ofpbuf_data(msg), ofpbuf_size(msg),
                           &retval, NULL);
        last_error = GetLastError();
        if (last_error != ERROR_SUCCESS && !result) {
            retval = -1;
            errno = EAGAIN;
        }
#else
        retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
#endif
        error = retval < 0 ? errno : 0;
    } while (error == EINTR);
    log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
    if (!error) {
        COVERAGE_INC(netlink_sent);
    }
    return error;
}

/* Sends the Netlink message in 'msg' to the kernel on 'sock', stamped with a
 * newly allocated sequence number.  Before transmission, nlmsg_len is set to
 * ofpbuf_size(msg) and nlmsg_pid to the pid of 'sock'.
 *
 * If 'wait' is true the send blocks until buffer space is available;
 * otherwise it fails with EAGAIN when the send buffer of 'sock' is full.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
int
nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
{
    uint32_t fresh_seq = nl_sock_allocate_seq(sock, 1);

    return nl_sock_send_seq(sock, msg, fresh_seq, wait);
}

/* Sends the Netlink message in 'msg' to the kernel on 'sock' using the
 * caller-supplied sequence number 'nlmsg_seq'.  Before transmission,
 * nlmsg_len is set to ofpbuf_size(msg) and nlmsg_pid to the pid of 'sock'.
 *
 * If 'wait' is true the send blocks until buffer space is available;
 * otherwise it fails with EAGAIN when the send buffer of 'sock' is full.
 *
 * Use this to reply to a request that arrived with sequence number
 * 'nlmsg_seq'; for anything else, nl_sock_send() is the right call.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
int
nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
                 uint32_t nlmsg_seq, bool wait)
{
    int error = nl_sock_send__(sock, msg, nlmsg_seq, wait);

    return error;
}

/* Receives one Netlink message from 'sock' into 'buf', replacing 'buf''s
 * previous contents.  'buf' must have at least sizeof(struct nlmsghdr) bytes
 * allocated.  On POSIX systems, 'wait' selects between a blocking receive
 * (true) and a non-blocking one (false); the Windows path does not consult
 * 'wait'.
 *
 * Returns 0 on success.  Otherwise returns a positive errno value, notably:
 * ENOBUFS if the kernel dropped messages because the receive buffer
 * overflowed, E2BIG if the message was truncated, and EPROTO if what arrived
 * is not a well-formed Netlink message. */
static int
nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    /* We can't accurately predict the size of the data to be received.  The
     * caller is supposed to have allocated enough space in 'buf' to handle the
     * "typical" case.  To handle exceptions, we make available enough space in
     * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
     * figure since that's the maximum length of a Netlink attribute). */
    struct nlmsghdr *nlmsghdr;
#ifdef _WIN32
#define MAX_STACK_LENGTH 81920
    uint8_t tail[MAX_STACK_LENGTH];
#else
    uint8_t tail[65536];
#endif
    struct iovec iov[2];
    struct msghdr msg;
    ssize_t retval;
    int error;

    ovs_assert(buf->allocated >= sizeof *nlmsghdr);
    ofpbuf_clear(buf);

    /* Scatter the incoming message: the first 'buf->allocated' bytes land
     * directly in 'buf''s own storage and anything beyond that in 'tail'. */
    iov[0].iov_base = ofpbuf_base(buf);
    iov[0].iov_len = buf->allocated;
    iov[1].iov_base = tail;
    iov[1].iov_len = sizeof tail;

    memset(&msg, 0, sizeof msg);
    msg.msg_iov = iov;
    msg.msg_iovlen = 2;

    /* Receive a Netlink message from the kernel.
     *
     * This works around a kernel bug in which the kernel returns an error code
     * as if it were the number of bytes read.  It doesn't actually modify
     * anything in the receive buffer in that case, so we can initialize the
     * Netlink header with an impossible message length and then, upon success,
     * check whether it changed. */
    nlmsghdr = ofpbuf_base(buf);
    do {
        nlmsghdr->nlmsg_len = UINT32_MAX;
#ifdef _WIN32
        /* The Windows path reads the whole message into 'tail' and then
         * copies it into 'buf'.
         *
         * NOTE(review): 'nlmsghdr' was taken from ofpbuf_base(buf) before
         * ofpbuf_put(); if that call can move 'buf''s storage, the pointer
         * would be stale when it is examined below -- confirm.  Also,
         * '&retval' is an 'ssize_t *' where ReadFile() presumably expects a
         * DWORD pointer -- confirm the upper bytes are handled. */
        boolean result = false;
        DWORD last_error = 0;
        result = ReadFile(sock->handle, tail, MAX_STACK_LENGTH, &retval, NULL);
        last_error = GetLastError();
        if (last_error != ERROR_SUCCESS && !result) {
            retval = -1;
            errno = EAGAIN;
        } else {
            ofpbuf_put(buf, tail, retval);
        }
#else
        retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);
#endif
        /* Classify the outcome:
         *   - negative return: a regular failure, reported through errno;
         *   - zero bytes: treated as a connection reset;
         *   - the sentinel length was overwritten: a real message arrived;
         *   - otherwise the "byte count" is really an error code (the kernel
         *     bug described above). */
        error = (retval < 0 ? errno
                 : retval == 0 ? ECONNRESET /* not possible? */
                 : nlmsghdr->nlmsg_len != UINT32_MAX ? 0
                 : retval);
    } while (error == EINTR);
    if (error) {
        if (error == ENOBUFS) {
            /* Socket receive buffer overflow dropped one or more messages that
             * the kernel tried to send to us. */
            COVERAGE_INC(netlink_overflow);
        }
        return error;
    }

    /* The message did not fit even with 'tail' available.  ('msg' stays
     * zeroed on the Windows path, so this never triggers there.) */
    if (msg.msg_flags & MSG_TRUNC) {
        VLOG_ERR_RL(&rl, "truncated message (longer than %"PRIuSIZE" bytes)",
                    sizeof tail);
        return E2BIG;
    }

    /* Reject anything too short to hold a header, or whose header claims a
     * length that is impossibly small or larger than what was received. */
    if (retval < sizeof *nlmsghdr
        || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
        || nlmsghdr->nlmsg_len > retval) {
        VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE" bytes < %"PRIuSIZE")",
                    retval, sizeof *nlmsghdr);
        return EPROTO;
    }
#ifndef _WIN32
    /* The leading part of the message is already in 'buf''s storage, so only
     * the size needs adjusting.  Any overflow that went to 'tail' is appended,
     * growing 'buf' as needed. */
    ofpbuf_set_size(buf, MIN(retval, buf->allocated));
    if (retval > buf->allocated) {
        COVERAGE_INC(netlink_recv_jumbo);
        ofpbuf_put(buf, tail, retval - buf->allocated);
    }
#endif

    log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
    COVERAGE_INC(netlink_received);

    return 0;
}

/* Receives one Netlink message from the kernel on 'sock' into 'buf'.  With a
 * true 'wait' the call blocks until a message is available; with a false
 * 'wait' it fails with EAGAIN when the receive buffer of 'sock' is empty.
 *
 * 'buf' must already be initialized with at least NLMSG_HDRLEN bytes
 * allocated.  Allocating room for a "typical" message up front gives the best
 * performance.
 *
 * On success, returns 0 and replaces the previous content of 'buf' with the
 * received message, expanding the allocation of 'buf' as needed to hold the
 * whole message.
 *
 * On failure, returns a positive errno value and clears 'buf' to zero length.
 * 'buf' retains its previous memory allocation.
 *
 * Whether the call succeeds or fails, the headroom of 'buf' is reset to 0. */
int
nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    int error = nl_sock_recv__(sock, buf, wait);

    return error;
}

/* Marks each of the first 'n' entries of 'transactions' as finished with
 * result 'error', emptying the reply buffer of every entry that has one. */
static void
nl_sock_record_errors__(struct nl_transaction **transactions, size_t n,
                        int error)
{
    while (n-- > 0) {
        struct nl_transaction *txn = *transactions++;
        struct ofpbuf *reply;

        txn->error = error;
        reply = txn->reply;
        if (reply != NULL) {
            ofpbuf_clear(reply);
        }
    }
}

static int
nl_sock_transact_multiple__(struct nl_sock *sock,
                            struct nl_transaction **transactions, size_t n,
                            size_t *done)
{
    uint64_t tmp_reply_stub[1024 / 8];
    struct nl_transaction tmp_txn;
    struct ofpbuf tmp_reply;

    uint32_t base_seq;
    struct iovec iovs[MAX_IOVS];
    struct msghdr msg;
    int error;
    int i;

    base_seq = nl_sock_allocate_seq(sock, n);
    *done = 0;
    for (i = 0; i < n; i++) {
        struct nl_transaction *txn = transactions[i];
        struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(txn->request);

        nlmsg->nlmsg_len = ofpbuf_size(txn->request);
        nlmsg->nlmsg_seq = base_seq + i;
        nlmsg->nlmsg_pid = sock->pid;

        iovs[i].iov_base = ofpbuf_data(txn->request);
        iovs[i].iov_len = ofpbuf_size(txn->request);
    }

    memset(&msg, 0, sizeof msg);
    msg.msg_iov = iovs;
    msg.msg_iovlen = n;
    do {
#ifdef _WIN32
    DWORD last_error = 0;
    bool result = FALSE;
    for (i = 0; i < n; i++) {
        result = WriteFile((HANDLE)sock->handle, iovs[i].iov_base, iovs[i].iov_len,
                           &error, NULL);
        last_error = GetLastError();
        if (last_error != ERROR_SUCCESS && !result) {
            error = EAGAIN;
            errno = EAGAIN;
        } else {
            error = 0;
        }
    }
#else
        error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0;
#endif
    } while (error == EINTR);

    for (i = 0; i < n; i++) {
        struct nl_transaction *txn = transactions[i];

        log_nlmsg(__func__, error, ofpbuf_data(txn->request), ofpbuf_size(txn->request),
                  sock->protocol);
    }
    if (!error) {
        COVERAGE_ADD(netlink_sent, n);
    }

    if (error) {
        return error;
    }

    ofpbuf_use_stub(&tmp_reply, tmp_reply_stub, sizeof tmp_reply_stub);
    tmp_txn.request = NULL;
    tmp_txn.reply = &tmp_reply;
    tmp_txn.error = 0;
    while (n > 0) {
        struct nl_transaction *buf_txn, *txn;
        uint32_t seq;

        /* Find a transaction whose buffer we can use for receiving a reply.
         * If no such transaction is left, use tmp_txn. */
        buf_txn = &tmp_txn;
        for (i = 0; i < n; i++) {
            if (transactions[i]->reply) {
                buf_txn = transactions[i];
                break;
            }
        }

        /* Receive a reply. */
        error = nl_sock_recv__(sock, buf_txn->reply, false);
        if (error) {
            if (error == EAGAIN) {
                nl_sock_record_errors__(transactions, n, 0);
                *done += n;
                error = 0;
            }
            break;
        }

        /* Match the reply up with a transaction. */
        seq = nl_msg_nlmsghdr(buf_txn->reply)->nlmsg_seq;
        if (seq < base_seq || seq >= base_seq + n) {
            VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32, seq);
            continue;
        }
        i = seq - base_seq;
        txn = transactions[i];

        /* Fill in the results for 'txn'. */
        if (nl_msg_nlmsgerr(buf_txn->reply, &txn->error)) {
            if (txn->reply) {
                ofpbuf_clear(txn->reply);
            }
            if (txn->error) {
                VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
                            error, ovs_strerror(txn->error));
            }
        } else {
            txn->error = 0;
            if (txn->reply && txn != buf_txn) {
                /* Swap buffers. */
                struct ofpbuf *reply = buf_txn->reply;
                buf_txn->reply = txn->reply;
                txn->reply = reply;
            }
        }

        /* Fill in the results for transactions before 'txn'.  (We have to do
         * this after the results for 'txn' itself because of the buffer swap
         * above.) */
        nl_sock_record_errors__(transactions, i, 0);

        /* Advance. */
        *done += i + 1;
        transactions += i + 1;
        n -= i + 1;
        base_seq += i + 1;
    }
    ofpbuf_uninit(&tmp_reply);

    return error;
}

/* Runs the 'n' transactions in 'transactions' on 'sock', in order, splitting
 * them into batches that are each sent to the kernel together.  When this
 * function returns, every transaction has had its 'error' member set (and its
 * reply filled in or cleared).
 *
 * A batch that fails with ENOBUFS (receive buffer overflow) is retried from
 * the first transaction that did not complete, as is one that fails with
 * EAGAIN.  Any other failure is recorded in all of the transactions that have
 * not completed and ends processing, because retrying a request that the
 * socket keeps rejecting would never terminate. */
static void
nl_sock_transact_multiple(struct nl_sock *sock,
                          struct nl_transaction **transactions, size_t n)
{
    int max_batch_count;
    int error;

    if (!n) {
        return;
    }

    /* In theory, every request could have a 64 kB reply.  But the default and
     * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to
     * be a bit below 128 kB, so that would only allow a single message in a
     * "batch".  So we assume that replies average (at most) 4 kB, which allows
     * a good deal of batching.
     *
     * In practice, most of the requests that we batch either have no reply at
     * all or a brief reply. */
    max_batch_count = MAX(sock->rcvbuf / 4096, 1);
    max_batch_count = MIN(max_batch_count, max_iovs);

    while (n > 0) {
        size_t count, bytes;
        size_t done;

        /* Batch up to 'max_batch_count' transactions.  But cap it at about a
         * page of requests total because big skbuffs are expensive to
         * allocate in the kernel.  */
#if defined(PAGESIZE)
        enum { MAX_BATCH_BYTES = MAX(1, PAGESIZE - 512) };
#else
        enum { MAX_BATCH_BYTES = 4096 - 512 };
#endif
        /* The first request always goes into the batch, whatever its size, so
         * that every pass makes progress. */
        bytes = ofpbuf_size(transactions[0]->request);
        for (count = 1; count < n && count < max_batch_count; count++) {
            if (bytes + ofpbuf_size(transactions[count]->request) > MAX_BATCH_BYTES) {
                break;
            }
            bytes += ofpbuf_size(transactions[count]->request);
        }

        error = nl_sock_transact_multiple__(sock, transactions, count, &done);
        transactions += done;
        n -= done;

        if (error == ENOBUFS) {
            VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
        } else if (error) {
            VLOG_ERR_RL(&rl, "transaction error (%s)", ovs_strerror(error));
            nl_sock_record_errors__(transactions, n, error);
            if (error != EAGAIN) {
                /* A fatal error has occurred.  Abort the rest of the
                 * transactions, which have just been marked as failed, rather
                 * than resending them indefinitely. */
                break;
            }
        }
    }
}

static int
nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
                 struct ofpbuf **replyp)
{
    struct nl_transaction *transactionp;
    struct nl_transaction transaction;

    transaction.request = CONST_CAST(struct ofpbuf *, request);
    transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
    transactionp = &transaction;

    nl_sock_transact_multiple(sock, &transactionp, 1);

    if (replyp) {
        if (transaction.error) {
            ofpbuf_delete(transaction.reply);
            *replyp = NULL;
        } else {
            *replyp = transaction.reply;
        }
    }

    return transaction.error;
}

/* Drain all the messages currently in 'sock''s receive queue.
 *
 * Returns whatever drain_rcvbuf() returns for the socket's file descriptor.
 * On Windows nothing is drained and the function always returns 0. */
int
nl_sock_drain(struct nl_sock *sock)
{
#ifdef _WIN32
    return 0;
#else
    return drain_rcvbuf(sock->fd);
#endif
}

/* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
 * Netlink socket created with the given 'protocol', and initializes 'dump' to
 * reflect the state of the operation.
 *
 * 'request' must contain a Netlink message.  Before sending the message,
 * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
 * set to the Netlink socket's pid.  NLM_F_DUMP and NLM_F_ACK will be set in
 * nlmsg_flags.
 *
 * The design of this Netlink socket library ensures that the dump is reliable.
 *
 * This function provides no status indication.  nl_dump_done() provides an
 * error status for the entire dump operation.
 *
 * The caller must eventually destroy 'request'.
 */
void
nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
{
    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;

    ovs_mutex_init(&dump->mutex);
    ovs_mutex_lock(&dump->mutex);
    dump->status = nl_pool_alloc(protocol, &dump->sock);
    if (!dump->status) {
        dump->status = nl_sock_send__(dump->sock, request,
                                      nl_sock_allocate_seq(dump->sock, 1),
                                      true);
    }
    dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
    ovs_mutex_unlock(&dump->mutex);
}

/* Reads from the socket of 'dump' into 'buffer' until 'buffer' holds a
 * message whose sequence number matches the dump's.  Messages carrying any
 * other sequence number are logged and discarded.  If 'buffer' is not empty
 * on entry, nothing is read.
 *
 * Returns 0 if 'buffer' now holds dump data, EOF if the socket has nothing
 * more to offer, or a positive errno value on a receive failure or if the
 * kernel answered the dump request with an error (in which case 'buffer' is
 * cleared). */
static int
nl_dump_refill(struct nl_dump *dump, struct ofpbuf *buffer)
    OVS_REQUIRES(dump->mutex)
{
    int nak_error;

    while (ofpbuf_size(buffer) == 0) {
        const struct nlmsghdr *hdr;
        int recv_error;

        recv_error = nl_sock_recv__(dump->sock, buffer, false);
        if (recv_error == EAGAIN) {
            /* The kernel never blocks providing the results of a dump, so
             * EAGAIN means that the whole thing has been read; report it as
             * EOF.  (The kernel always provides NLMSG_DONE as a sentinel.
             * Some other thread must have received that already but not yet
             * signaled it in 'status'.) */
            return EOF;
        } else if (recv_error) {
            /* Any other error is just an error. */
            return recv_error;
        }

        hdr = nl_msg_nlmsghdr(buffer);
        if (hdr->nlmsg_seq != dump->nl_seq) {
            VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32,
                        hdr->nlmsg_seq, dump->nl_seq);
            ofpbuf_clear(buffer);
        }
    }

    if (!nl_msg_nlmsgerr(buffer, &nak_error) || !nak_error) {
        return 0;
    }

    VLOG_INFO_RL(&rl, "netlink dump request error (%s)",
                 ovs_strerror(nak_error));
    ofpbuf_clear(buffer);
    return nak_error;
}

static int
nl_dump_next__(struct ofpbuf *reply, struct ofpbuf *buffer)
{
    struct nlmsghdr *nlmsghdr = nl_msg_next(buffer, reply);
    if (!nlmsghdr) {
        VLOG_WARN_RL(&rl, "netlink dump contains message fragment");
        return EPROTO;
    } else if (nlmsghdr->nlmsg_type == NLMSG_DONE) {
        return EOF;
    } else {
        return 0;
    }
}

/* Fetches the next reply of 'dump' into 'reply', using 'buffer' as the
 * working area.  'dump' must have been set up by nl_dump_start() and 'buffer'
 * must be initialized; it should be at least NL_DUMP_BUFSIZE bytes long.
 *
 * Returns true on success, with 'reply->data' and 'ofpbuf_size(reply)'
 * describing the retrieved message.  The caller must not modify 'reply': it
 * points into 'buffer', which later calls will reuse.
 *
 * Returns false on failure, with 'reply->data' set to NULL and
 * 'ofpbuf_size(reply)' set to 0.  Failure can mean a real error or simply
 * that there are no more replies; nl_dump_done() reports the error status of
 * the dump as a whole.
 *
 * Several threads may call this function on the same nl_dump, provided that
 * each uses its own buffers.  A buffer may end up holding several replies;
 * those are handed out before anything more is fetched.  Once this function
 * has returned false, other threads can still work through the replies
 * already in their buffers, but no thread fetches more.
 */
bool
nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
{
    int status = 0;

    /* Refill only when the buffer has run dry.
     *
     * A buffer that still has data is used without looking at the dump's
     * status.  Doing otherwise could skip results: thread A might reach EOF
     * while thread B is still partway through a batch. */
    if (ofpbuf_size(buffer) == 0) {
        ovs_mutex_lock(&dump->mutex);
        if (dump->status == 0) {
            /* The mutex is held across the receive to avoid an in-kernel
             * race.  Two threads reading a Netlink dump socket at once can
             * leave the socket error set to EINVAL, to be hit by the next
             * recv on that socket, which could be anywhere because Netlink
             * sockets are pooled.  Serializing the recv calls prevents
             * that. */
            dump->status = nl_dump_refill(dump, buffer);
        }
        status = dump->status;
        ovs_mutex_unlock(&dump->mutex);
    }

    /* Pull the next message out of the buffer. */
    if (status == 0) {
        status = nl_dump_next__(reply, buffer);
        if (status != 0) {
            /* Publish the outcome as the dump status, taking care that an
             * error already recorded is not replaced by EOF. */
            ovs_mutex_lock(&dump->mutex);
            if (dump->status <= 0) {
                dump->status = status;
            }
            ovs_mutex_unlock(&dump->mutex);
        }
    }

    if (status == 0) {
        return true;
    }

    ofpbuf_set_data(reply, NULL);
    ofpbuf_set_size(reply, 0);
    return false;
}

/* Completes Netlink dump operation 'dump', which must have been initialized
 * with nl_dump_start().  Returns 0 if the dump operation was error-free,
 * otherwise a positive errno value describing the problem. */
int
nl_dump_done(struct nl_dump *dump)
{
    int status;

    ovs_mutex_lock(&dump->mutex);
    status = dump->status;
    ovs_mutex_unlock(&dump->mutex);

    /* Drain any remaining messages that the client didn't read.  Otherwise the
     * kernel will continue to queue them up and waste buffer space.
     *
     * XXX We could just destroy and discard the socket in this case. */
    if (!status) {
        uint64_t tmp_reply_stub[NL_DUMP_BUFSIZE / 8];
        struct ofpbuf reply, buf;

        ofpbuf_use_stub(&buf, tmp_reply_stub, sizeof tmp_reply_stub);
        while (nl_dump_next(dump, &reply, &buf)) {
            /* Nothing to do. */
        }
        ofpbuf_uninit(&buf);

        ovs_mutex_lock(&dump->mutex);
        status = dump->status;
        ovs_mutex_unlock(&dump->mutex);
        ovs_assert(status);
    }

    nl_pool_release(dump->sock);
    ovs_mutex_destroy(&dump->mutex);

    return status == EOF ? 0 : status;
}

/* Causes poll_block() to wake up when any of the specified 'events' (which is
 * an OR'd combination of POLLIN, POLLOUT, etc.) occur on 'sock'.
 *
 * This registers the socket's file descriptor (or, on Windows, its device
 * handle) with poll_fd_wait(). */
void
nl_sock_wait(const struct nl_sock *sock, short int events)
{
#ifdef _WIN32
    poll_fd_wait(sock->handle, events);
#else
    poll_fd_wait(sock->fd, events);
#endif
}

/* Exposes the descriptor underlying 'sock' (its handle on Windows, its fd
 * elsewhere) for "poll()"-like operations that cannot go through
 * nl_sock_wait().
 *
 * Using the returned fd correctly is a little tricky, because nl_sock does
 * "copy on write" to allow a single nl_sock to be used for notifications,
 * transactions, and dumps.  The usage is safe as long as 'sock' is used only
 * for notifications and transactions, and never for a dump. */
int
nl_sock_fd(const struct nl_sock *sock)
{
#ifndef _WIN32
    return sock->fd;
#else
    return sock->handle;
#endif
}

/* Reports the Netlink PID stored in 'sock'. */
uint32_t
nl_sock_pid(const struct nl_sock *sock)
{
    const uint32_t pid = sock->pid;

    return pid;
}

/* Miscellaneous.  */

/* One Generic Netlink family known to this module, stored in the
 * 'genl_families' map so that log output can show names instead of bare
 * numbers. */
struct genl_family {
    struct hmap_node hmap_node; /* In 'genl_families', hashed on 'id'. */
    uint16_t id;                /* Family number, as seen in nlmsg_type. */
    char *name;                 /* Family name; malloc'd, owned by the entry. */
};

/* Every family recorded by define_genl_family(), keyed by
 * hash_int(id, 0). */
static struct hmap genl_families = HMAP_INITIALIZER(&genl_families);

/* Parsing policy for a family lookup reply: the family ID is a mandatory
 * 16-bit attribute, and the multicast group list is an optional nested
 * attribute. */
static const struct nl_policy family_policy[CTRL_ATTR_MAX + 1] = {
    [CTRL_ATTR_FAMILY_ID] = {.type = NL_A_U16},
    [CTRL_ATTR_MCAST_GROUPS] = {.type = NL_A_NESTED, .optional = true},
};

/* Searches 'genl_families' for the entry whose family number is 'id'.
 * Returns the entry, or NULL if no family with that number was recorded. */
static struct genl_family *
find_genl_family_by_id(uint16_t id)
{
    struct genl_family *candidate;

    HMAP_FOR_EACH_IN_BUCKET (candidate, hmap_node, hash_int(id, 0),
                             &genl_families) {
        /* Other ids may share the bucket, so compare the id itself. */
        if (candidate->id != id) {
            continue;
        }
        return candidate;
    }
    return NULL;
}

/* Records that Generic Netlink family number 'id' is called 'name'.
 *
 * A new entry is added to 'genl_families' if 'id' is not yet known.  If it is
 * known under a different name, the stored name is replaced; if the name
 * already matches, nothing changes.  'name' is copied. */
static void
define_genl_family(uint16_t id, const char *name)
{
    struct genl_family *entry = find_genl_family_by_id(id);

    if (!entry) {
        entry = xmalloc(sizeof *entry);
        entry->id = id;
        hmap_insert(&genl_families, &entry->hmap_node, hash_int(id, 0));
    } else if (strcmp(entry->name, name) == 0) {
        /* Already recorded under this exact name. */
        return;
    } else {
        free(entry->name);
    }
    entry->name = xstrdup(name);
}

static const char *
genl_family_to_name(uint16_t id)
{
    if (id == GENL_ID_CTRL) {
        return "control";
    } else {
        struct genl_family *family = find_genl_family_by_id(id);
        return family ? family->name : "unknown";
    }
}

#ifndef _WIN32
static int
do_lookup_genl_family(const char *name, struct nlattr **attrs,
                      struct ofpbuf **replyp)
{
    struct nl_sock *sock;
    struct ofpbuf request, *reply;
    int error;

    *replyp = NULL;
    error = nl_sock_create(NETLINK_GENERIC, &sock);
    if (error) {
        return error;
    }

    ofpbuf_init(&request, 0);
    nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
                          CTRL_CMD_GETFAMILY, 1);
    nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name);
    error = nl_sock_transact(sock, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        nl_sock_destroy(sock);
        return error;
    }

    if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
                         family_policy, attrs, ARRAY_SIZE(family_policy))
        || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
        nl_sock_destroy(sock);
        ofpbuf_delete(reply);
        return EPROTO;
    }

    nl_sock_destroy(sock);
    *replyp = reply;
    return 0;
}
#else
static int
do_lookup_genl_family(const char *name, struct nlattr **attrs,
                      struct ofpbuf **replyp)
{
    struct nl_sock *sock;
    struct nlmsghdr *nlmsg;
    struct ofpbuf *reply;
    int error;
    uint16_t family_id;
    const char *family_name;
    uint32_t family_version;
    uint32_t family_attrmax;
    uint32_t mcgrp_id = OVS_WIN_NL_INVALID_MCGRP_ID;
    const char *mcgrp_name = NULL;

    *replyp = NULL;
    reply = ofpbuf_new(1024);

    /* CTRL_ATTR_MCAST_GROUPS is supported only for VPORT family. */
    if (!strcmp(name, OVS_WIN_CONTROL_FAMILY)) {
        family_id = OVS_WIN_NL_CTRL_FAMILY_ID;
        family_name = OVS_WIN_CONTROL_FAMILY;
        family_version = OVS_WIN_CONTROL_VERSION;
        family_attrmax = OVS_WIN_CONTROL_ATTR_MAX;
    } else if (!strcmp(name, OVS_DATAPATH_FAMILY)) {
        family_id = OVS_WIN_NL_DATAPATH_FAMILY_ID;
        family_name = OVS_DATAPATH_FAMILY;
        family_version = OVS_DATAPATH_VERSION;
        family_attrmax = OVS_DP_ATTR_MAX;
    } else if (!strcmp(name, OVS_PACKET_FAMILY)) {
        family_id = OVS_WIN_NL_PACKET_FAMILY_ID;
        family_name = OVS_PACKET_FAMILY;
        family_version = OVS_PACKET_VERSION;
        family_attrmax = OVS_PACKET_ATTR_MAX;
    } else if (!strcmp(name, OVS_VPORT_FAMILY)) {
        family_id = OVS_WIN_NL_VPORT_FAMILY_ID;
        family_name = OVS_VPORT_FAMILY;
        family_version = OVS_VPORT_VERSION;
        family_attrmax = OVS_VPORT_ATTR_MAX;
        mcgrp_id = OVS_WIN_NL_VPORT_MCGRP_ID;
        mcgrp_name = OVS_VPORT_MCGROUP;
    } else if (!strcmp(name, OVS_FLOW_FAMILY)) {
        family_id = OVS_WIN_NL_FLOW_FAMILY_ID;
        family_name = OVS_FLOW_FAMILY;
        family_version = OVS_FLOW_VERSION;
        family_attrmax = OVS_FLOW_ATTR_MAX;
    } else {
        ofpbuf_delete(reply);
        return EINVAL;
    }

    nl_msg_put_genlmsghdr(reply, 0, GENL_ID_CTRL, 0,
                          CTRL_CMD_NEWFAMILY, family_version);
    /* CTRL_ATTR_HDRSIZE and CTRL_ATTR_OPS are not populated, but the
     * callers do not seem to need them. */
    nl_msg_put_u16(reply, CTRL_ATTR_FAMILY_ID, family_id);
    nl_msg_put_string(reply, CTRL_ATTR_FAMILY_NAME, family_name);
    nl_msg_put_u32(reply, CTRL_ATTR_VERSION, family_version);
    nl_msg_put_u32(reply, CTRL_ATTR_MAXATTR, family_attrmax);

    if (mcgrp_id != OVS_WIN_NL_INVALID_MCGRP_ID) {
        size_t mcgrp_ofs1 = nl_msg_start_nested(reply, CTRL_ATTR_MCAST_GROUPS);
        size_t mcgrp_ofs2= nl_msg_start_nested(reply,
            OVS_WIN_NL_VPORT_MCGRP_ID - OVS_WIN_NL_MCGRP_START_ID);
        nl_msg_put_u32(reply, CTRL_ATTR_MCAST_GRP_ID, mcgrp_id);
        ovs_assert(mcgrp_name != NULL);
        nl_msg_put_string(reply, CTRL_ATTR_MCAST_GRP_NAME, mcgrp_name);
        nl_msg_end_nested(reply, mcgrp_ofs2);
        nl_msg_end_nested(reply, mcgrp_ofs1);
    }

    /* Set the total length of the netlink message. */
    nlmsg = nl_msg_nlmsghdr(reply);
    nlmsg->nlmsg_len = ofpbuf_size(reply);

    if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
                         family_policy, attrs, ARRAY_SIZE(family_policy))
        || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
        nl_sock_destroy(sock);
        ofpbuf_delete(reply);
        return EPROTO;
    }

    *replyp = reply;
    return 0;
}
#endif

/* Finds the multicast group called 'group_name' in genl family 'family_name'.
 * When successful, writes its result to 'multicast_group' and returns 0.
 * Otherwise, clears 'multicast_group' and returns a positive error code.
 */
int
nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
                       unsigned int *multicast_group)
{
    struct nlattr *family_attrs[ARRAY_SIZE(family_policy)];
    const struct nlattr *mc;
    struct ofpbuf *reply;
    unsigned int left;
    int error;

    *multicast_group = 0;
    error = do_lookup_genl_family(family_name, family_attrs, &reply);
    if (error) {
        return error;
    }

    if (!family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
        error = EPROTO;
        goto exit;
    }

    NL_NESTED_FOR_EACH (mc, left, family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
        static const struct nl_policy mc_policy[] = {
            [CTRL_ATTR_MCAST_GRP_ID] = {.type = NL_A_U32},
            [CTRL_ATTR_MCAST_GRP_NAME] = {.type = NL_A_STRING},
        };

        struct nlattr *mc_attrs[ARRAY_SIZE(mc_policy)];
        const char *mc_name;

        if (!nl_parse_nested(mc, mc_policy, mc_attrs, ARRAY_SIZE(mc_policy))) {
            error = EPROTO;
            goto exit;
        }

        mc_name = nl_attr_get_string(mc_attrs[CTRL_ATTR_MCAST_GRP_NAME]);
        if (!strcmp(group_name, mc_name)) {
            *multicast_group =
                nl_attr_get_u32(mc_attrs[CTRL_ATTR_MCAST_GRP_ID]);
            error = 0;
            goto exit;
        }
    }
    error = EPROTO;

exit:
    ofpbuf_delete(reply);
    return error;
}

/* If '*number' is 0, translates the given Generic Netlink family 'name' to a
 * number and stores it in '*number'.  If successful, returns 0 and the caller
 * may use '*number' as the family number.  On failure, returns a positive
 * errno value and '*number' caches the errno value. */
int
nl_lookup_genl_family(const char *name, int *number)
{
    if (*number == 0) {
        struct nlattr *attrs[ARRAY_SIZE(family_policy)];
        struct ofpbuf *reply;
        int error;

        error = do_lookup_genl_family(name, attrs, &reply);
        if (!error) {
            *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
            define_genl_family(*number, name);
        } else {
            *number = -error;
        }
        ofpbuf_delete(reply);

        ovs_assert(*number != 0);
    }
    return *number > 0 ? 0 : -*number;
}

/* A small stack of idle Netlink sockets kept around for reuse by
 * nl_pool_alloc() and refilled by nl_pool_release(). */
struct nl_pool {
    struct nl_sock *socks[16];  /* Idle sockets; the first 'n' are valid. */
    int n;                      /* Number of sockets currently in 'socks'. */
};

/* Protects 'pools'. */
static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER;
/* One pool per Netlink protocol, indexed by protocol number. */
static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex);

/* Obtains a Netlink socket for 'protocol', preferring an idle one from the
 * pool and creating a fresh one only when the pool is empty.
 *
 * Returns 0 and stores the socket in '*sockp' when a pooled socket is reused;
 * otherwise returns whatever nl_sock_create() returns.  'protocol' must be a
 * valid index into 'pools'. */
static int
nl_pool_alloc(int protocol, struct nl_sock **sockp)
{
    struct nl_sock *cached = NULL;
    struct nl_pool *pool;

    ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));

    ovs_mutex_lock(&pool_mutex);
    pool = &pools[protocol];
    if (pool->n > 0) {
        /* Pop the most recently released socket. */
        pool->n--;
        cached = pool->socks[pool->n];
    }
    ovs_mutex_unlock(&pool_mutex);

    if (!cached) {
        return nl_sock_create(protocol, sockp);
    }

    *sockp = cached;
    return 0;
}

/* Gives 'sock' back after use.  The socket is parked in the pool for its
 * protocol if there is room; otherwise it is destroyed.  A null 'sock' is
 * ignored. */
static void
nl_pool_release(struct nl_sock *sock)
{
    struct nl_pool *pool;

    if (!sock) {
        return;
    }

    pool = &pools[sock->protocol];

    ovs_mutex_lock(&pool_mutex);
    if (pool->n < ARRAY_SIZE(pool->socks)) {
        pool->socks[pool->n] = sock;
        pool->n++;
        /* The pool owns the socket now; nothing is left to destroy. */
        sock = NULL;
    }
    ovs_mutex_unlock(&pool_mutex);

    nl_sock_destroy(sock);
}

/* Sends 'request' to the kernel on a Netlink socket for the given 'protocol'
 * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
 * successful, returns 0.  On failure, returns a positive errno value.
 *
 * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
 * reply, which the caller is responsible for freeing with ofpbuf_delete(), and
 * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
 * reply, if any, is discarded.
 *
 * Before the message is sent, nlmsg_len in 'request' will be finalized to
 * match ofpbuf_size(request), nlmsg_pid will be set to the pid of the socket
 * used for sending the request, and nlmsg_seq will be initialized.
 *
 * The caller is responsible for destroying 'request'.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink.
 *
 * In Netlink, sending a request to the kernel is reliable enough, because the
 * kernel will tell us if the message cannot be queued (and we will in that
 * case put it on the transmit queue and wait until it can be delivered).
 *
 * Receiving the reply is the real problem: if the socket buffer is full when
 * the kernel tries to send the reply, the reply will be dropped.  However, the
 * kernel sets a flag that a reply has been dropped.  The next call to recv
 * then returns ENOBUFS.  We can then re-send the request.
 *
 * Caveats:
 *
 *      1. Netlink depends on sequence numbers to match up requests and
 *         replies.  The sender of a request supplies a sequence number, and
 *         the reply echos back that sequence number.
 *
 *         This is fine, but (1) some kernel netlink implementations are
 *         broken, in that they fail to echo sequence numbers and (2) this
 *         function will drop packets with non-matching sequence numbers, so
 *         that only a single request can be usefully transacted at a time.
 *
 *      2. Resending the request causes it to be re-executed, so the request
 *         needs to be idempotent.
 */
int
nl_transact(int protocol, const struct ofpbuf *request,
            struct ofpbuf **replyp)
{
    struct nl_sock *sock;
    int error;

    error = nl_pool_alloc(protocol, &sock);
    if (error) {
        /* 'replyp' is allowed to be null, so only clear it when the caller
         * actually asked for a reply. */
        if (replyp) {
            *replyp = NULL;
        }
        return error;
    }

    error = nl_sock_transact(sock, request, replyp);

    nl_pool_release(sock);
    return error;
}

/* Transacts a batch of requests on a Netlink socket for the given 'protocol'
 * (e.g. NETLINK_ROUTE or NETLINK_GENERIC).
 *
 * The 'request' member of each of the 'n' elements of 'transactions' is sent,
 * in order, and responses to all of them are received.  Each transaction's
 * 'error' member is set to 0 if it succeeded, otherwise to a positive errno
 * value.  If a transaction's 'reply' is nonnull, it is filled with the reply
 * when the message receives a detailed reply, and cleared in the other cases,
 * i.e. where the request failed or had no reply beyond an indication of
 * success.
 *
 * If no socket can be obtained at all, that error is recorded in every
 * transaction.
 *
 * The caller is responsible for destroying each request and reply, and the
 * transactions array itself.
 *
 * Before sending each message, this function will finalize nlmsg_len in each
 * 'request' to match the ofpbuf's size, set nlmsg_pid to the pid of the socket
 * used for the transaction, and initialize nlmsg_seq.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink.  See
 * nl_transact() for some caveats.
 */
void
nl_transact_multiple(int protocol,
                     struct nl_transaction **transactions, size_t n)
{
    struct nl_sock *sock;
    int error = nl_pool_alloc(protocol, &sock);

    if (error) {
        nl_sock_record_errors__(transactions, n, error);
        return;
    }

    nl_sock_transact_multiple(sock, transactions, n);
    nl_pool_release(sock);
}


/* Reserves 'n' consecutive sequence numbers on 'sock' and returns the first
 * of them. */
static uint32_t
nl_sock_allocate_seq(struct nl_sock *sock, unsigned int n)
{
    const uint32_t first = sock->next_seq;

    sock->next_seq += n;
    if (sock->next_seq >= UINT32_MAX / 2) {
        /* Restart well before the counter could wrap around to 0 on a later
         * request.  Restart at 1 rather than 0, because the kernel uses
         * sequence number 0 for notifications. */
        sock->next_seq = 1;
    }

    return first;
}

/* Appends to 'ds' a one-line rendering of the Netlink header 'h': length,
 * type (followed by a symbolic description), flags, sequence number, and pid.
 *
 * 'protocol' only affects how a family-defined message type is described:
 * for NETLINK_GENERIC the name from genl_family_to_name() is printed,
 * otherwise the generic text "(family-defined)". */
static void
nlmsghdr_to_string(const struct nlmsghdr *h, int protocol, struct ds *ds)
{
    struct nlmsg_flag {
        unsigned int bits;
        const char *name;
    };
    /* Entries are tried in this order, and the bits of each entry that
     * matches are cleared before the later entries are tested. */
    static const struct nlmsg_flag known_flags[] = {
        { NLM_F_REQUEST, "REQUEST" },
        { NLM_F_MULTI, "MULTI" },
        { NLM_F_ACK, "ACK" },
        { NLM_F_ECHO, "ECHO" },
        { NLM_F_DUMP, "DUMP" },
        { NLM_F_ROOT, "ROOT" },
        { NLM_F_MATCH, "MATCH" },
        { NLM_F_ATOMIC, "ATOMIC" },
    };
    uint16_t remaining;
    size_t i;

    ds_put_format(ds, "nl(len:%"PRIu32", type=%"PRIu16,
                  h->nlmsg_len, h->nlmsg_type);

    switch (h->nlmsg_type) {
    case NLMSG_NOOP:
        ds_put_cstr(ds, "(no-op)");
        break;

    case NLMSG_ERROR:
        ds_put_cstr(ds, "(error)");
        break;

    case NLMSG_DONE:
        ds_put_cstr(ds, "(done)");
        break;

    case NLMSG_OVERRUN:
        ds_put_cstr(ds, "(overrun)");
        break;

    default:
        if (h->nlmsg_type < NLMSG_MIN_TYPE) {
            ds_put_cstr(ds, "(reserved)");
        } else if (protocol == NETLINK_GENERIC) {
            ds_put_format(ds, "(%s)", genl_family_to_name(h->nlmsg_type));
        } else {
            ds_put_cstr(ds, "(family-defined)");
        }
        break;
    }

    ds_put_format(ds, ", flags=%"PRIx16, h->nlmsg_flags);
    remaining = h->nlmsg_flags;
    for (i = 0; i < ARRAY_SIZE(known_flags); i++) {
        const struct nlmsg_flag *f = &known_flags[i];

        if ((remaining & f->bits) != f->bits) {
            continue;
        }
        ds_put_format(ds, "[%s]", f->name);
        remaining &= ~f->bits;
    }
    if (remaining) {
        /* Bits that no table entry accounted for. */
        ds_put_format(ds, "[OTHER:%"PRIx16"]", remaining);
    }

    ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32,
                  h->nlmsg_seq, h->nlmsg_pid);
}

static char *
nlmsg_to_string(const struct ofpbuf *buffer, int protocol)
{
    struct ds ds = DS_EMPTY_INITIALIZER;
    const struct nlmsghdr *h = ofpbuf_at(buffer, 0, NLMSG_HDRLEN);
    if (h) {
        nlmsghdr_to_string(h, protocol, &ds);
        if (h->nlmsg_type == NLMSG_ERROR) {
            const struct nlmsgerr *e;
            e = ofpbuf_at(buffer, NLMSG_HDRLEN,
                          NLMSG_ALIGN(sizeof(struct nlmsgerr)));
            if (e) {
                ds_put_format(&ds, " error(%d", e->error);
                if (e->error < 0) {
                    ds_put_format(&ds, "(%s)", ovs_strerror(-e->error));
                }
                ds_put_cstr(&ds, ", in-reply-to(");
                nlmsghdr_to_string(&e->msg, protocol, &ds);
                ds_put_cstr(&ds, "))");
            } else {
                ds_put_cstr(&ds, " error(truncated)");
            }
        } else if (h->nlmsg_type == NLMSG_DONE) {
            int *error = ofpbuf_at(buffer, NLMSG_HDRLEN, sizeof *error);
            if (error) {
                ds_put_format(&ds, " done(%d", *error);
                if (*error < 0) {
                    ds_put_format(&ds, "(%s)", ovs_strerror(-*error));
                }
                ds_put_cstr(&ds, ")");
            } else {
                ds_put_cstr(&ds, " done(truncated)");
            }
        } else if (protocol == NETLINK_GENERIC) {
            struct genlmsghdr *genl = nl_msg_genlmsghdr(buffer);
            if (genl) {
                ds_put_format(&ds, ",genl(cmd=%"PRIu8",version=%"PRIu8")",
                              genl->cmd, genl->version);
            }
        }
    } else {
        ds_put_cstr(&ds, "nl(truncated)");
    }
    return ds.string;
}

static void
log_nlmsg(const char *function, int error,
          const void *message, size_t size, int protocol)
{
    struct ofpbuf buffer;
    char *nlmsg;

    if (!VLOG_IS_DBG_ENABLED()) {
        return;
    }

    ofpbuf_use_const(&buffer, message, size);
    nlmsg = nlmsg_to_string(&buffer, protocol);
    VLOG_DBG_RL(&rl, "%s (%s): %s", function, ovs_strerror(error), nlmsg);
    free(nlmsg);
}
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								/*
-												netlink: Update comment for nl_dump_start().

The function comment still referred to a 'msg' variable, which has been
renamed.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
[blp@nicira.com did further proofreading]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-13 13:50:22 -08:00
+								 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 *
 								 * Licensed under the Apache License, Version 2.0 (the "License");
 								 * you may not use this file except in compliance with the License.
 								 * You may obtain a copy of the License at:
 								 *
 								 *     http://www.apache.org/licenses/LICENSE-2.0
 								 *
 								 * Unless required by applicable law or agreed to in writing, software
 								 * distributed under the License is distributed on an "AS IS" BASIS,
 								 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								 * See the License for the specific language governing permissions and
 								 * limitations under the License.
 								 */
 								#include <config.h>
 								#include "netlink-socket.h"
 								#include <errno.h>
 								#include <inttypes.h>
 								#include <stdlib.h>
 								#include <sys/types.h>
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								#include <sys/uio.h>
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								#include <unistd.h>
 								#include "coverage.h"
 								#include "dynamic-string.h"
-												netlink-socket: Log Generic Netlink family names.

The ids for Generic Netlink family names aren't very helpful because they
can vary from machine to machine and even from one boot to the next.  So
this change logs their names too.

This only affects logging at DBG level.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-11 11:02:32 -08:00
+								#include "hash.h"
 								#include "hmap.h"
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								#include "netlink.h"
 								#include "netlink-protocol.h"
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								#include "odp-netlink.h"
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								#include "ofpbuf.h"
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
+								#include "ovs-thread.h"
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								#include "poll-loop.h"
-												netlink: Make nl_dump_next() thread-safe.

This patch modifies 'struct nl_dump' and nl_dump_next() to allow
multiple threads to share the same nl_dump. These changes are targeted
around synchronizing dump status between multiple callers, and
allowing callers to fully process their existing buffers before
determining whether to stop fetching flows.

The 'status' field of 'struct nl_dump' becomes atomic, so that multiple
threads may check and/or update it to communicate when there is an error
or the netlink dump is finished. The low bit holds whether the final
message was seen, while the higher bits hold an errno value.

nl_dump_next() will now read all messages from the given buffer before
checking the shared error status and attempting to fetch more. Multiple
threads may call this with the same nl_dump, but must provide
independent buffers. As previously, the final dump status can be
determined by calling nl_dump_done() from a single thread.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:06 -08:00
+								#include "seq.h"
-												netlink-socket: New function for draining the receive buffer.

This will be used in an upcoming patch.

Reviewed by Justin Pettit.

											
										
										
											2011-01-11 16:05:37 -08:00
+								#include "socket-util.h"
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								#include "util.h"
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								#include "vlog.h"
 								VLOG_DEFINE_THIS_MODULE(netlink_socket);
 								COVERAGE_DEFINE(netlink_overflow);
 								COVERAGE_DEFINE(netlink_received);
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								COVERAGE_DEFINE(netlink_recv_jumbo);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								COVERAGE_DEFINE(netlink_sent);
 								/* Linux header file confusion causes this to be undefined. */
 								#ifndef SOL_NETLINK
 								#define SOL_NETLINK 270
 								#endif
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC, use a handle instead of an fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								static struct ovs_mutex portid_mutex = OVS_MUTEX_INITIALIZER;
 								static uint32_t g_last_portid = 0;
 								/* Port IDs must be unique! */
 								static uint32_t
 								portid_next(void)
 								    OVS_GUARDED_BY(portid_mutex)
 								{
 								    g_last_portid++;
 								    return g_last_portid;
 								}
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								#endif /* _WIN32 */
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC, use a handle instead of an fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								/* A single (bad) Netlink message can in theory dump out many, many log
 								 * messages, so the burst size is set quite high here to avoid missing useful
 								 * information.  Also, at high logging levels we log *all* Netlink messages. */
 								static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);
-												netlink: Postpone choosing sequence numbers until send time.

Choosing sequence numbers at time of creating a packet means that
nl_sock_transact_multiple() has to search for the sequence number
of a reply, because the sequence numbers of the requests aren't
necessarily sequential.  This commit makes it possible to avoid
the search, by deferring choice of sequence numbers until the
time that we send the packets.  It doesn't actually modify
nl_sock_transact_multiple(), which will happen in a later commit.

Previously, I was concerned about a theoretical race condition
described in a comment in the old version of this code:

    This implementation uses sequence numbers that are unique
    process-wide, to avoid a hypothetical race: send request, close
    socket, open new socket that reuses the old socket's PID value,
    send request on new socket, receive reply from kernel to old
    socket but with same PID and sequence number.  (This race could be
    avoided other ways, e.g. by preventing PIDs from being quickly
    reused).

However, I no longer believe that this can be a real problem,
because Netlink operates synchronously.  The reply to a request
will always arrive before the socket can be closed and a new
socket opened with the old socket's PID.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-16 16:01:01 -07:00
+								static uint32_t nl_sock_allocate_seq(struct nl_sock *, unsigned int n);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								static void log_nlmsg(const char *function, int error,
-												netlink-socket: Slightly improve logging of Generic Netlink messages.

This makes the stream of requests and replies very slightly easier to
understand.

Reviewed by Justin Pettit.

											
										
										
											2011-01-18 14:07:52 -08:00
+								                      const void *message, size_t size, int protocol);
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								#ifdef _WIN32
-												netlink-socket: fix typo to get_sock_pid_from_kernel()

A typo crept in while respinning get_sock_pid_from_kernel() in the previous
patch. Fixing it now. Also, get_sock_pid_from_kernel() doesn't need an OUT
argument. Fixing that too.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:20:51 -07:00
+								static int get_sock_pid_from_kernel(struct nl_sock *sock);
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								#endif
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
 								/* Netlink sockets. */
-												netlink-socket: Minor style fix.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-09 11:18:27 -07:00
+								struct nl_sock {
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    HANDLE handle;
 								#else
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    int fd;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink: Postpone choosing sequence numbers until send time.

Choosing sequence numbers at time of creating a packet means that
nl_sock_transact_multiple() has to search for the sequence number
of a reply, because the sequence numbers of the requests aren't
necessarily sequential.  This commit makes it possible to avoid
the search, by deferring choice of sequence numbers until the
time that we send the packets.  It doesn't actually modify
nl_sock_transact_multiple(), which will happen in a later commit.

Previously, I was concerned about a theoretical race condition
described in a comment in the old version of this code:

    This implementation uses sequence numbers that are unique
    process-wide, to avoid a hypothetical race: send request, close
    socket, open new socket that reuses the old socket's PID value,
    send request on new socket, receive reply from kernel to old
    socket but with same PID and sequence number.  (This race could be
    avoided other ways, e.g. by preventing PIDs from being quickly
    reused).

However, I no longer believe that this can be a real problem,
because Netlink operates synchronously.  The reply to a request
will always arrive before the socket can be closed and a new
socket opened with the old socket's PID.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-16 16:01:01 -07:00
+								    uint32_t next_seq;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    uint32_t pid;
-												netlink-socket: Slightly improve logging of Generic Netlink messages.

This makes the stream of requests and replies very slightly easier to
understand.

Reviewed by Justin Pettit.

											
										
										
											2011-01-18 14:07:52 -08:00
+								    int protocol;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								};
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								/* Compile-time limit on iovecs, so that we can allocate a maximum-size array
 								 * of iovecs on the stack. */
 								#define MAX_IOVS 128
 								/* Maximum number of iovecs that may be passed to sendmsg, capped at a
 								 * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
 								 *
 								 * Initialized by nl_sock_create(). */
 								static int max_iovs;
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
 								static void nl_pool_release(struct nl_sock *);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
 								/* Creates a new netlink socket for the given netlink 'protocol'
 								 * (NETLINK_ROUTE, NETLINK_GENERIC, ...).  Returns 0 and sets '*sockp' to the
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								 * new socket if successful, otherwise returns a positive errno value. */
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								int
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								nl_sock_create(int protocol, struct nl_sock **sockp)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
+								    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    struct nl_sock *sock;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifndef _WIN32
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    struct sockaddr_nl local, remote;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Let the kernel choose Netlink pids for us.

The Netlink code in the Linux kernel has been willing to choose unique
Netlink pids for userspace sockets since at least 2.4.36 and probably
earlier.  There's no value in choosing them ourselves.

This simplifies the code and eliminates the possibility of exhausting our
supply of Netlink PIDs.

											
										
										
											2011-11-14 10:10:58 -08:00
+								    socklen_t local_size;
-												netlink-socket: Increase Netlink socket receive buffer size.

Open vSwitch userspace can set up flows at a high rate, but it is somewhat
"bursty" in opportunities to set up flows, by which I mean that OVS sets up
a batch of flows, then goes off and does some other work for a while, then
sets up another batch of flows, and so on.  The result is that, if a large
number of packets that need flow setups come in all at once, then some of
them can overflow the relatively small kernel-to-user buffers.

This commit increases the kernel-to-user buffers from the default of
approximately 120 kB each to 1 MB each.  In one somewhat synthetic test
case that I ran based on an "hping3" that generated a load of about 20,000
new flows per second (including both requests and replies), this reduced
the packets dropped at the kernel-to-user interface from about 30% to none.
I expect that it will similarly improve packet loss in workloads where
flow arrival is not easily predictable.

(This has little effect on workloads generated by "ovs-benchmark rate"
because that benchmark is effectively "self-clocking", that is, a new flow
is triggered only by a reply to a request made earlier, which means that
the number of buffered packets at any given time has a known, constant upper
limit.)

Bug #10210.
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-03-15 21:15:38 -07:00
+								    int rcvbuf;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    int retval = 0;
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
+								    if (ovsthread_once_start(&once)) {
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        int save_errno = errno;
 								        errno = 0;
 								        max_iovs = sysconf(_SC_UIO_MAXIOV);
 								        if (max_iovs < _XOPEN_IOV_MAX) {
 								            if (max_iovs == -1 && errno) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", ovs_strerror(errno));
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								            }
 								            max_iovs = _XOPEN_IOV_MAX;
 								        } else if (max_iovs > MAX_IOVS) {
 								            max_iovs = MAX_IOVS;
 								        }
 								        errno = save_errno;
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
+								        ovsthread_once_done(&once);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    }
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    *sockp = NULL;
-												netlink-socket: Use xmalloc() instead of malloc().

This was the only obvious use of bare malloc() in the tree, other
than in the implementation of wrapper functions.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-26 14:15:37 -07:00
+								    sock = xmalloc(sizeof *sock);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    sock->handle = CreateFileA("\\\\.\\OpenVSwitchDevice",
 								                               GENERIC_READ | GENERIC_WRITE,
 								                               FILE_SHARE_READ | FILE_SHARE_WRITE,
 								                               NULL, OPEN_EXISTING,
 								                               FILE_ATTRIBUTE_NORMAL, NULL);
 								    int last_error = GetLastError();
 								    if (sock->handle == INVALID_HANDLE_VALUE) {
 								        VLOG_ERR("fcntl: %s", ovs_strerror(last_error));
 								        goto error;
 								    }
 								#else
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
 								    if (sock->fd < 0) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_ERR("fcntl: %s", ovs_strerror(errno));
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        goto error;
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Slightly improve logging of Generic Netlink messages.

This makes the stream of requests and replies very slightly easier to
understand.

Reviewed by Justin Pettit.

											
										
										
											2011-01-18 14:07:52 -08:00
+								    sock->protocol = protocol;
-												netlink: Postpone choosing sequence numbers until send time.

Choosing sequence numbers at time of creating a packet means that
nl_sock_transact_multiple() has to search for the sequence number
of a reply, because the sequence numbers of the requests aren't
necessarily sequential.  This commit makes it possible to avoid
the search, by deferring choice of sequence numbers until the
time that we send the packets.  It doesn't actually modify
nl_sock_transact_multiple(), which will happen in a later commit.

Previously, I was concerned about a theoretical race condition
described in a comment in the old version of this code:

    This implementation uses sequence numbers that are unique
    process-wide, to avoid a hypothetical race: send request, close
    socket, open new socket that reuses the old socket's PID value,
    send request on new socket, receive reply from kernel to old
    socket but with same PID and sequence number.  (This race could be
    avoided other ways, e.g. by preventing PIDs from being quickly
    reused).

However, I no longer believe that this can be a real problem,
because Netlink operates synchronously.  The reply to a request
will always arrive before the socket can be closed and a new
socket opened with the old socket's PID.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-16 16:01:01 -07:00
+								    sock->next_seq = 1;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
-												netlink-socket: Increase Netlink socket receive buffer size.

Open vSwitch userspace can set up flows at a high rate, but it is somewhat
"bursty" in opportunities to set up flows, by which I mean that OVS sets up
a batch of flows, then goes off and does some other work for a while, then
sets up another batch of flows, and so on.  The result is that, if a large
number of packets that need flow setups come in all at once, then some of
them can overflow the relatively small kernel-to-user buffers.

This commit increases the kernel-to-user buffers from the default of
approximately 120 kB each to 1 MB each.  In one somewhat synthetic test
case that I ran based on an "hping3" that generated a load of about 20,000
new flows per second (including both requests and replies), this reduced
the packets dropped at the kernel-to-user interface from about 30% to none.
I expect that it will similarly improve packet loss in workloads where
flow arrival is not easily predictable.

(This has little effect on workloads generated by "ovs-benchmark rate"
because that benchmark is effectively "self-clocking", that is, a new flow
is triggered only by a reply to a request made earlier, which means that
the number of buffered packets at any given time has a known, constant upper
limit.)

Bug #10210.
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-03-15 21:15:38 -07:00
+								    rcvbuf = 1024 * 1024;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    sock->rcvbuf = rcvbuf;
-												netlink-socket: fix typo to get_sock_pid_from_kernel()

A typo crept in while respinning get_sock_pid_from_kernel() in the previous
patch. Fixing it now. Also, get_sock_pid_from_kernel() doesn't need an OUT
argument. Fixing that too.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:20:51 -07:00
+								    retval = get_sock_pid_from_kernel(sock);
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								    if (retval != 0) {
 								        goto error;
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#else
-												netlink-socket: Increase Netlink socket receive buffer size.

Open vSwitch userspace can set up flows at a high rate, but it is somewhat
"bursty" in opportunities to set up flows, by which I mean that OVS sets up
a batch of flows, then goes off and does some other work for a while, then
sets up another batch of flows, and so on.  The result is that, if a large
number of packets that need flow setups come in all at once, then some of
them can overflow the relatively small kernel-to-user buffers.

This commit increases the kernel-to-user buffers from the default of
approximately 120 kB each to 1 MB each.  In one somewhat synthetic test
case that I ran based on an "hping3" that generated a load of about 20,000
new flows per second (including both requests and replies), this reduced
the packets dropped at the kernel-to-user interface from about 30% to none.
I expect that it will similarly improve packet loss in workloads where
flow arrival is not easily predictable.

(This has little effect on workloads generated by "ovs-benchmark rate"
because that benchmark is effectively "self-clocking", that is, a new flow
is triggered only by a reply to a request made earlier, which means that
the number of buffered packets at any given time has a known, constant upper
limit.)

Bug #10210.
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-03-15 21:15:38 -07:00
+								    if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUFFORCE,
 								                   &rcvbuf, sizeof rcvbuf)) {
-												netlink-socket: Don't bother logging SO_RCVBUFFORCE failure as non-root.

Some Open vSwitch utilities can do useful work when they are not run as
root.  Without this commit, these utilities will log a warning on failure
to use the SO_RCVBUFFORCE socket option if they open any Netlink sockets.
This will always happen, it does not report anything unexpected or
fixable as non-root, and sometimes it makes users wonder if something is
wrong, so there is no benefit to logging it.  This commit drops it in that
case.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-08-17 15:40:03 -07:00
+								        /* Only root can use SO_RCVBUFFORCE.  Everyone else gets EPERM.
 								         * Warn only if the failure is therefore unexpected. */
-												netlink-socket: Don't bother logging SO_RCVBUFFORCE failure

This patch fixes tests when they are run with "fakeroot debian/rules binary"
command.

The problem was that under fakeroot setsockopt() call could still return
EPERM and lead to a warning message being logged.

Signed-off-by: Ansis Atteka <aatteka@nicira.com>

											
										
										
											2013-04-11 11:33:24 -07:00
+								        if (errno != EPERM) {
-												netlink-socket: Don't bother logging SO_RCVBUFFORCE failure as non-root.

Some Open vSwitch utilities can do useful work when they are not run as
root.  Without this commit, these utilities will log a warning on failure
to use the SO_RCVBUFFORCE socket option if they open any Netlink sockets.
This will always happen, it does not report anything unexpected or
fixable as non-root, and sometimes it makes users wonder if something is
wrong, so there is no benefit to logging it.  This commit drops it in that
case.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-08-17 15:40:03 -07:00
+								            VLOG_WARN_RL(&rl, "setting %d-byte socket receive buffer failed "
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                         "(%s)", rcvbuf, ovs_strerror(errno));
-												netlink-socket: Don't bother logging SO_RCVBUFFORCE failure as non-root.

Some Open vSwitch utilities can do useful work when they are not run as
root.  Without this commit, these utilities will log a warning on failure
to use the SO_RCVBUFFORCE socket option if they open any Netlink sockets.
This will always happen, it does not report anything unexpected or
fixable as non-root, and sometimes it makes users wonder if something is
wrong, so there is no benefit to logging it.  This commit drops it in that
case.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-08-17 15:40:03 -07:00
+								        }
-												netlink-socket: Increase Netlink socket receive buffer size.

Open vSwitch userspace can set up flows at a high rate, but it is somewhat
"bursty" in opportunities to set up flows, by which I mean that OVS sets up
a batch of flows, then goes off and does some other work for a while, then
sets up another batch of flows, and so on.  The result is that, if a large
number of packets that need flow setups come in all at once, then some of
them can overflow the relatively small kernel-to-user buffers.

This commit increases the kernel-to-user buffers from the default of
approximately 120 kB each to 1 MB each.  In one somewhat synthetic test
case that I ran based on an "hping3" that generated a load of about 20,000
new flows per second (including both requests and replies), this reduced
the packets dropped at the kernel-to-user interface from about 30% to none.
I expect that it will similarly improve packet loss in workloads where
flow arrival is not easily predictable.

(This has little effect on workloads generated by "ovs-benchmark rate"
because that benchmark is effectively "self-clocking", that is, a new flow
is triggered only by a reply to a request made earlier, which means that
the number of buffered packets at any given time has a known, constant upper
limit.)

Bug #10210.
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-03-15 21:15:38 -07:00
+								    }
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    retval = get_socket_rcvbuf(sock->fd);
 								    if (retval < 0) {
 								        retval = -retval;
 								        goto error;
 								    }
 								    sock->rcvbuf = retval;
-												netlink-socket: Let the kernel choose Netlink pids for us.

The Netlink code in the Linux kernel has been willing to choose unique
Netlink pids for userspace sockets since at least 2.4.36 and probably
earlier.  There's no value in choosing them ourselves.

This simplifies the code and eliminates the possibility of exhausting our
supply of Netlink PIDs.

											
										
										
											2011-11-14 10:10:58 -08:00
+								    /* Connect to kernel (pid 0) as remote address. */
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    memset(&remote, 0, sizeof remote);
 								    remote.nl_family = AF_NETLINK;
 								    remote.nl_pid = 0;
 								    if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_ERR("connect(0): %s", ovs_strerror(errno));
-												netlink-socket: Let the kernel choose Netlink pids for us.

The Netlink code in the Linux kernel has been willing to choose unique
Netlink pids for userspace sockets since at least 2.4.36 and probably
earlier.  There's no value in choosing them ourselves.

This simplifies the code and eliminates the possibility of exhausting our
supply of Netlink PIDs.

											
										
										
											2011-11-14 10:10:58 -08:00
+								        goto error;
 								    }
 								    /* Obtain pid assigned by kernel. */
 								    local_size = sizeof local;
 								    if (getsockname(sock->fd, (struct sockaddr *) &local, &local_size) < 0) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_ERR("getsockname: %s", ovs_strerror(errno));
-												netlink-socket: Let the kernel choose Netlink pids for us.

The Netlink code in the Linux kernel has been willing to choose unique
Netlink pids for userspace sockets since at least 2.4.36 and probably
earlier.  There's no value in choosing them ourselves.

This simplifies the code and eliminates the possibility of exhausting our
supply of Netlink PIDs.

											
										
										
											2011-11-14 10:10:58 -08:00
+								        goto error;
 								    }
 								    if (local_size < sizeof local || local.nl_family != AF_NETLINK) {
 								        VLOG_ERR("getsockname returned bad Netlink name");
 								        retval = EINVAL;
 								        goto error;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
-												netlink-socket: Let the kernel choose Netlink pids for us.

The Netlink code in the Linux kernel has been willing to choose unique
Netlink pids for userspace sockets since at least 2.4.36 and probably
earlier.  There's no value in choosing them ourselves.

This simplifies the code and eliminates the possibility of exhausting our
supply of Netlink PIDs.

											
										
										
											2011-11-14 10:10:58 -08:00
+								    sock->pid = local.nl_pid;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
 								    *sockp = sock;
 								    return 0;
 								error:
 								    if (retval == 0) {
 								        retval = errno;
 								        if (retval == 0) {
 								            retval = EINVAL;
 								        }
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    if (sock->handle != INVALID_HANDLE_VALUE) {
 								        CloseHandle(sock->handle);
 								    }
 								#else
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    if (sock->fd >= 0) {
 								        close(sock->fd);
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    free(sock);
 								    return retval;
 								}
-												netlink-socket: Make dumping and doing transactions on same nl_sock safe.

It's not safe to use a single Netlink fd to do multiple operations in an
asynchronous way.  Some of the limitations are fundamental; for example, the
kernel only supports a single "dump" operation at a time.  Others are
limitations imposed by the OVS coding style; for example, our Netlink
library is not callback based, so nothing can be done about incoming
messages that can't be handled immediately.  Regardless, in OVS multicast
groups, transactions, and dumps cannot coexist on a single nl_sock.

This is only mildly irritating at the moment, but it will become much worse
later on, when dpif-linux shifts to using Netlink dumps for listing various
kinds of datapath entities.  When that happens, a dump will be in progress
in situations where the dpif-linux client might want to do other
operations.  For example, it is reasonable for the client to list flows
and, in the middle, look up information on vports mentioned in those flows.
It might be possible to simply ban and avoid such nested operations--I have
not even audited the source tree to find out whether we do anything like
that already--but that seems like an unnecessary cramp on our coding style.
Furthermore, it's difficult to explain and justify without understanding
the implementation.

This patch takes another approach, by improving the Netlink socket library
to avoid artificial constraints.  When an operation, or a dump, or joining
a multicast group would cause a problem, this patch makes the library
transparently create a separate Netlink socket.  This solves the problem
without putting any onerous restrictions on use.

This commit also slightly simplifies netdev_vport_reset_names().  It had
been written to destroy the dump object before the Netlink socket that it
used, but this is no longer necessary and doing it in the opposite order
saved a few lines of code.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-22 15:23:10 -08:00
+								/* Creates a new netlink socket for the same protocol as 'src'.  Returns 0 and
 								 * sets '*sockp' to the new socket if successful, otherwise returns a positive
 								 * errno value.  */
 								int
 								nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp)
 								{
 								    return nl_sock_create(src->protocol, sockp);
 								}
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								/* Destroys netlink socket 'sock'. */
 								void
 								nl_sock_destroy(struct nl_sock *sock)
 								{
 								    if (sock) {
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								        CloseHandle(sock->handle);
 								#else
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								        close(sock->fd);
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								        free(sock);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
 								}
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								#ifdef _WIN32
 								/* Reads the pid for 'sock' generated in the kernel datapath. The function
 								 * follows a transaction semantic. Eventually this function should call into
 								 * nl_transact. */
 								static int
-												netlink-socket: fix typo to get_sock_pid_from_kernel()

A typo crept in while respinning get_sock_pid_from_kernel() in the previous
patch. Fixing it now. Also, get_sock_pid_from_kernel() doesn't need an OUT
argument. Fixing that too.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:20:51 -07:00
+								get_sock_pid_from_kernel(struct nl_sock *sock)
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								{
 								    struct nl_transaction txn;
 								    struct ofpbuf request;
 								    uint64_t request_stub[128];
 								    struct ofpbuf reply;
 								    uint64_t reply_stub[128];
 								    struct ovs_header *ovs_header;
 								    struct nlmsghdr *nlmsg;
 								    uint32_t seq;
 								    int retval;
 								    DWORD bytes;
 								    int ovs_msg_size = sizeof (struct nlmsghdr) + sizeof (struct genlmsghdr) +
 								                       sizeof (struct ovs_header);
 								    ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
 								    txn.request = &request;
 								    ofpbuf_use_stub(&reply, reply_stub, sizeof reply_stub);
 								    txn.reply = &reply;
 								    seq = nl_sock_allocate_seq(sock, 1);
 								    nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
 								                          OVS_CTRL_CMD_WIN_GET_PID, OVS_WIN_CONTROL_VERSION);
 								    nlmsg = nl_msg_nlmsghdr(txn.request);
 								    nlmsg->nlmsg_seq = seq;
 								    ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
 								    ovs_header->dp_ifindex = 0;
 								    ovs_header = ofpbuf_put_uninit(&reply, ovs_msg_size);
 								    if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
 								                         ofpbuf_data(txn.request), ofpbuf_size(txn.request),
 								                         ofpbuf_data(txn.reply), ofpbuf_size(txn.reply),
 								                         &bytes, NULL)) {
 								        retval = EINVAL;
 								        goto done;
 								    } else {
 								        if (bytes < ovs_msg_size) {
 								            retval = EINVAL;
 								            goto done;
 								        }
-												netlink-socket: fix typo to get_sock_pid_from_kernel()

A typo crept in while respinning get_sock_pid_from_kernel() in the previous
patch. Fixing it now. Also, get_sock_pid_from_kernel() doesn't need an OUT
argument. Fixing that too.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:20:51 -07:00
+								        nlmsg = nl_msg_nlmsghdr(txn.reply);
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								        if (nlmsg->nlmsg_seq != seq) {
 								            retval = EINVAL;
 								            goto done;
 								        }
-												netlink-socket: fix typo to get_sock_pid_from_kernel()

A typo crept in while respinning get_sock_pid_from_kernel() in the previous
patch. Fixing it now. Also, get_sock_pid_from_kernel() doesn't need an OUT
argument. Fixing that too.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:20:51 -07:00
+								        sock->pid = nlmsg->nlmsg_pid;
-												netlink-socket.c: implement get pid support on Windows

To verify if the netlink support in the kernel works, I updated
the netlink-socket.c code to get the PID for a given device
descriptor.

In the existing code, userspace sets the PID, which will not be
unique across different processes. So, it is better for the
kernel to generate the PID and give it back to userspace.

dpif-linux.c was ported to Windows (similar to Alin's change in
the cloudbase repo) and was able to exercise the code changes
in netlink-socket.c to read the PID. dpif-linux.c changes are
not being checked in.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Acked-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Ankur Sharma <ankursharma@vmware.com>
Acked-by: Saurabh Shah <ssaurabh@vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/18
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-19 13:51:54 -07:00
+								    }
 								    retval = 0;
 								done:
 								    ofpbuf_uninit(&request);
 								    ofpbuf_uninit(&reply);
 								    return retval;
 								}
 								#endif  /* _WIN32 */
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								/* Tries to add 'sock' as a listener for 'multicast_group'.  Returns 0 if
 								 * successful, otherwise a positive errno value.
 								 *
-												netlink-socket: Async notifications are incompatible with other operations.

A Netlink socket that receives asynchronous notifications (e.g. from a
multicast group) cannot be used for transactions or dumps, because those
operations would discard asynchronous messages that arrive while waiting
for replies.

This commit documents this issue in a comment on nl_sock_join_mcgroup().
It also removes an internal attempt to avoid mixing multicast reception
with other operations.  The attempt was incomplete, because it only
handled dumps even though ordinary transactions are also problematic.  It
seems better to remove it than to fix it because, first, all of the
existing users in OVS already separate multicast reception from other
operations and, second, an upcoming commit will start using unicast
Netlink for asynchronous notifications, which has the same issues but
doesn't use nl_sock_join_mcgroup().

											
										
										
											2011-09-22 11:36:39 -07:00
+								 * A socket that is subscribed to a multicast group that receives asynchronous
 								 * notifications must not be used for Netlink transactions or dumps, because
 								 * transactions and dumps can cause notifications to be lost.
 								 *
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								 * Multicast group numbers are always positive.
 								 *
 								 * It is not an error to attempt to join a multicast group to which a socket
 								 * already belongs. */
 								int
 								nl_sock_join_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
 								{
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								#define OVS_VPORT_MCGROUP_FALLBACK_ID 33
 								    struct ofpbuf msg_buf;
 								    struct message_multicast
 								    {
 								        struct nlmsghdr;
 								        /* if true, join; if else, leave */
 								        unsigned char join;
 								        unsigned int groupId;
 								    };
 								    struct message_multicast msg = { 0 };
 								    msg.nlmsg_len = sizeof(struct message_multicast);
 								    msg.nlmsg_type = OVS_VPORT_MCGROUP_FALLBACK_ID;
 								    msg.nlmsg_flags = 0;
 								    msg.nlmsg_seq = 0;
 								    msg.nlmsg_pid = sock->pid;
 								    msg.join = 1;
 								    msg.groupId = multicast_group;
 								    msg_buf.base_ = &msg;
 								    msg_buf.data_ = &msg;
 								    msg_buf.size_ = msg.nlmsg_len;
 								    nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
 								#else
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
 								                   &multicast_group, sizeof multicast_group) < 0) {
 								        VLOG_WARN("could not join multicast group %u (%s)",
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                  multicast_group, ovs_strerror(errno));
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								        return errno;
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								    return 0;
 								}
 								/* Tries to make 'sock' stop listening to 'multicast_group'.  Returns 0 if
 								 * successful, otherwise a positive errno value.
 								 *
 								 * Multicast group numbers are always positive.
 								 *
 								 * It is not an error to attempt to leave a multicast group to which a socket
 								 * does not belong.
 								 *
 								 * On success, reading from 'sock' will still return any messages that were
 								 * received on 'multicast_group' before the group was left. */
 								int
 								nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
 								{
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    struct ofpbuf msg_buf;
 								    struct message_multicast
 								    {
 								        struct nlmsghdr;
 								        /* if true, join; if else, leave*/
 								        unsigned char join;
 								    };
 								    struct message_multicast msg = { 0 };
 								    nl_msg_put_nlmsghdr(&msg, sizeof(struct message_multicast),
 								                        multicast_group, 0);
 								    msg.join = 0;
 								    msg_buf.base_ = &msg;
 								    msg_buf.data_ = &msg;
 								    msg_buf.size_ = msg.nlmsg_len;
 								    nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
 								#else
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_DROP_MEMBERSHIP,
 								                   &multicast_group, sizeof multicast_group) < 0) {
 								        VLOG_WARN("could not leave multicast group %u (%s)",
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                  multicast_group, ovs_strerror(errno));
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								        return errno;
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Add functions for joining and leaving multicast groups.

When this library was originally implemented, support for Linux 2.4 was
important.  The Netlink implementation in Linux only added support for
joining and leaving multicast groups after a socket is bound as of Linux
2.6.14, so the library did not support it either.  But the current version
of Open vSwitch targets Linux 2.6.18 and over, so it's fine to add this
support now, and this commit does so.

This will be used more extensively in upcoming commits.

Reviewed by Justin Pettit.

											
										
										
											2011-01-09 16:57:45 -08:00
+								    return 0;
 								}
-												netlink-socket: Make dumping and doing transactions on same nl_sock safe.

It's not safe to use a single Netlink fd to do multiple operations in an
synchronous way.  Some of the limitations are fundamental; for example, the
kernel only supports a single "dump" operation at a time.  Others are
limitations imposed by the OVS coding style; for example, our Netlink
library is not callback based, so nothing can be done about incoming
messages that can't be handled immediately.  Regardless, in OVS multicast
groups, transactions, and dumps cannot coexist on a single nl_sock.

This is only mildly irritating at the moment, but it will become much worse
later on, when dpif-linux shifts to using Netlink dumps for listing various
kinds of datapath entities.  When that happens, a dump will be in progress
in situations where the dpif-linux client might want to do other
operations.  For example, it is reasonable for the client to list flows
and, in the middle, look up information on vports mentioned in those flows.
It might be possible to simply ban and avoid such nested operations--I have
not even audited the source tree to find out whether we do anything like
that already--but that seems like an unnecessary cramp on our coding style.
Furthermore, it's difficult to explain and justify without understanding
the implementation.

This patch takes another approach, by improving the Netlink socket library
to avoid artificial constraints.  When an operation, or a dump, or joining
a multicast group would cause a problem, this patch makes the library
transparently create a separate Netlink socket.  This solves the problem
without putting any onerous restrictions on use.

This commit also slightly simplifies netdev_vport_reset_names().  It had
been written to destroy the dump object before the Netlink socket that it
used, but this is no longer necessary and doing it in the opposite order
saved a few lines of code.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-22 15:23:10 -08:00
+								static int
-												ovs-brcompatd: Fix sending replies to kernel requests.

Commit 7d7447 (netlink: Postpone choosing sequence numbers until send
time.) broke ovs-brcompatd because it prevented userspace replies to
kernel requests from using the correct sequence numbers.  This commit fixes
it.

Atzm Watanabe found the root cause and provided an alternative patch to
avoid the problem.

Reported-by: André Ruß <andre.russ@hybris.com>
Reported-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Tested-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-07-05 08:41:03 -07:00
+								nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
 								               uint32_t nlmsg_seq, bool wait)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
 								    struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(msg);
 								    int error;
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								    nlmsg->nlmsg_len = ofpbuf_size(msg);
-												ovs-brcompatd: Fix sending replies to kernel requests.

Commit 7d7447 (netlink: Postpone choosing sequence numbers until send
time.) broke ovs-brcompatd because it prevented userspace replies to
kernel requests from using the correct sequence numbers.  This commit fixes
it.

Atzm Watanabe found the root cause and provided an alternative patch to
avoid the problem.

Reported-by: André Ruß <andre.russ@hybris.com>
Reported-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Tested-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-07-05 08:41:03 -07:00
+								    nlmsg->nlmsg_seq = nlmsg_seq;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    nlmsg->nlmsg_pid = sock->pid;
 								    do {
 								        int retval;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								        bool result;
 								        DWORD last_error = 0;
 								        result = WriteFile(sock->handle, ofpbuf_data(msg), ofpbuf_size(msg),
 								                           &retval, NULL);
 								        last_error = GetLastError();
 								        if (last_error != ERROR_SUCCESS && !result) {
 								            retval = -1;
 								            errno = EAGAIN;
 								        }
 								#else
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								        retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        error = retval < 0 ? errno : 0;
 								    } while (error == EINTR);
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								    log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    if (!error) {
 								        COVERAGE_INC(netlink_sent);
 								    }
 								    return error;
 								}
-												netlink-socket: Make dumping and doing transactions on same nl_sock safe.

It's not safe to use a single Netlink fd to do multiple operations in an
synchronous way.  Some of the limitations are fundamental; for example, the
kernel only supports a single "dump" operation at a time.  Others are
limitations imposed by the OVS coding style; for example, our Netlink
library is not callback based, so nothing can be done about incoming
messages that can't be handled immediately.  Regardless, in OVS multicast
groups, transactions, and dumps cannot coexist on a single nl_sock.

This is only mildly irritating at the moment, but it will become much worse
later on, when dpif-linux shifts to using Netlink dumps for listing various
kinds of datapath entities.  When that happens, a dump will be in progress
in situations where the dpif-linux client might want to do other
operations.  For example, it is reasonable for the client to list flows
and, in the middle, look up information on vports mentioned in those flows.
It might be possible to simply ban and avoid such nested operations--I have
not even audited the source tree to find out whether we do anything like
that already--but that seems like an unnecessary cramp on our coding style.
Furthermore, it's difficult to explain and justify without understanding
the implementation.

This patch takes another approach, by improving the Netlink socket library
to avoid artificial constraints.  When an operation, or a dump, or joining
a multicast group would cause a problem, this patch makes the library
transparently create a separate Netlink socket.  This solves the problem
without putting any onerous restrictions on use.

This commit also slightly simplifies netdev_vport_reset_names().  It had
been written to destroy the dump object before the Netlink socket that it
used, but this is no longer necessary and doing it in the opposite order
saved a few lines of code.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-22 15:23:10 -08:00
+								/* Tries to send 'msg', which must contain a Netlink message, to the kernel on
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								 * 'sock'.  nlmsg_len in 'msg' will be finalized to match ofpbuf_size(msg), nlmsg_pid
-												ovs-brcompatd: Fix sending replies to kernel requests.

Commit 7d7447 (netlink: Postpone choosing sequence numbers until send
time.) broke ovs-brcompatd because it prevented userspace replies to
kernel requests from using the correct sequence numbers.  This commit fixes
it.

Atzm Watanabe found the root cause and provided an alternative patch to
avoid the problem.

Reported-by: André Ruß <andre.russ@hybris.com>
Reported-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Tested-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-07-05 08:41:03 -07:00
+								 * will be set to 'sock''s pid, and nlmsg_seq will be initialized to a fresh
 								 * sequence number, before the message is sent.
-												netlink-socket: Make dumping and doing transactions on same nl_sock safe.

It's not safe to use a single Netlink fd to do multiple operations in an
synchronous way.  Some of the limitations are fundamental; for example, the
kernel only supports a single "dump" operation at a time.  Others are
limitations imposed by the OVS coding style; for example, our Netlink
library is not callback based, so nothing can be done about incoming
messages that can't be handled immediately.  Regardless, in OVS multicast
groups, transactions, and dumps cannot coexist on a single nl_sock.

This is only mildly irritating at the moment, but it will become much worse
later on, when dpif-linux shifts to using Netlink dumps for listing various
kinds of datapath entities.  When that happens, a dump will be in progress
in situations where the dpif-linux client might want to do other
operations.  For example, it is reasonable for the client to list flows
and, in the middle, look up information on vports mentioned in those flows.
It might be possible to simply ban and avoid such nested operations--I have
not even audited the source tree to find out whether we do anything like
that already--but that seems like an unnecessary cramp on our coding style.
Furthermore, it's difficult to explain and justify without understanding
the implementation.

This patch takes another approach, by improving the Netlink socket library
to avoid artificial constraints.  When an operation, or a dump, or joining
a multicast group would cause a problem, this patch makes the library
transparently create a separate Netlink socket.  This solves the problem
without putting any onerous restrictions on use.

This commit also slightly simplifies netdev_vport_reset_names().  It had
been written to destroy the dump object before the Netlink socket that it
used, but this is no longer necessary and doing it in the opposite order
saved a few lines of code.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-22 15:23:10 -08:00
+								 *
 								 * Returns 0 if successful, otherwise a positive errno value.  If
 								 * 'wait' is true, then the send will wait until buffer space is ready;
 								 * otherwise, returns EAGAIN if the 'sock' send buffer is full. */
 								int
 								nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
 								{
 								    /* Obtain the sequence number for this message from 'sock', then let
 								     * nl_sock_send_seq() do the actual transmission. */
 								    uint32_t seq = nl_sock_allocate_seq(sock, 1);

 								    return nl_sock_send_seq(sock, msg, seq, wait);
 								}
 								/* Tries to send 'msg', which must contain a Netlink message, to the kernel on
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								 * 'sock'.  nlmsg_len in 'msg' will be finalized to match ofpbuf_size(msg), nlmsg_pid
-												ovs-brcompatd: Fix sending replies to kernel requests.

Commit 7d7447 (netlink: Postpone choosing sequence numbers until send
time.) broke ovs-brcompatd because it prevented userspace replies to
kernel requests from using the correct sequence numbers.  This commit fixes
it.

Atzm Watanabe found the root cause and provided an alternative patch to
avoid the problem.

Reported-by: André Ruß <andre.russ@hybris.com>
Reported-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Tested-by: Atzm Watanabe <atzm@stratosphere.co.jp>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-07-05 08:41:03 -07:00
+								 * will be set to 'sock''s pid, and nlmsg_seq will be initialized to
 								 * 'nlmsg_seq', before the message is sent.
 								 *
 								 * Returns 0 if successful, otherwise a positive errno value.  If
 								 * 'wait' is true, then the send will wait until buffer space is ready;
 								 * otherwise, returns EAGAIN if the 'sock' send buffer is full.
 								 *
 								 * This function is suitable for sending a reply to a request that was received
 								 * with sequence number 'nlmsg_seq'.  Otherwise, use nl_sock_send() instead. */
 								int
 								nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
 								                 uint32_t nlmsg_seq, bool wait)
 								{
 								    /* Public entry point: the work is done by nl_sock_send__(), whose
 								     * result is handed back unchanged. */
 								    int error = nl_sock_send__(sock, msg, nlmsg_seq, wait);

 								    return error;
 								}
 								static int
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    /* We can't accurately predict the size of the data to be received.  The
 								     * caller is supposed to have allocated enough space in 'buf' to handle the
 								     * "typical" case.  To handle exceptions, we make available enough space in
 								     * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
 								     * figure since that's the maximum length of a Netlink attribute). */
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    struct nlmsghdr *nlmsghdr;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC, use a handle instead of an fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								#define MAX_STACK_LENGTH 81920
 								    uint8_t tail[MAX_STACK_LENGTH];
 								#else
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    uint8_t tail[65536];
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC, use a handle instead of an fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								    struct iovec iov[2];
 								    struct msghdr msg;
 								    ssize_t retval;
-												netlink-socket: Work around upstream kernel Netlink bug.

The upstream kernel net/netlink/af_netlink.c netlink_recvmsg() contains the
following code to refill the Netlink socket buffer with more dump skbs
while a dump is in progress:

	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}

The netlink_dump() function that this calls returns a negative number on
error, the convention used throughout the kernel, and thus sk->sk_err
receives a negative value on error.

However, sk->sk_err is supposed to contain either 0 or a positive errno
value, as one can see from a quick "grep" through net for 'sk_err =', e.g.:

    ipv4/tcp.c:2067:		sk->sk_err = ECONNRESET;
    ipv4/tcp.c:2069:		sk->sk_err = ECONNRESET;
    ipv4/tcp_input.c:4106:		sk->sk_err = ECONNREFUSED;
    ipv4/tcp_input.c:4109:		sk->sk_err = EPIPE;
    ipv4/tcp_input.c:4114:		sk->sk_err = ECONNRESET;
    netlink/af_netlink.c:741:			sk->sk_err = ENOBUFS;
    netlink/af_netlink.c:1796:			sk->sk_err = ENOBUFS;
    packet/af_packet.c:2476:		sk->sk_err = ENETDOWN;
    unix/af_unix.c:341:			other->sk_err = ECONNRESET;
    unix/af_unix.c:407:				skpair->sk_err = ECONNRESET;

The result is that the next attempt to receive from the socket will return
the error to userspace with the wrong sign.

(The root of the error in this case is that multiple threads are attempting
to read a single flow dump from a shared fd.  That should work, but the
kernel has an internal race that can result in one or more of those threads
hitting the EINVAL case at the start of netlink_dump().  The EINVAL is
harmless in this case and userspace should be able to ignore it, but
reporting the EINVAL as if it were a 22-byte message received in userspace
throws a real wrench in the works.)

This bug makes me think that there are probably not many programs doing
multithreaded Netlink dumps.  Maybe it is good that we are considering
other approaches.

VMware-BZ: #1255704
Reported-by: Mihir Gangar <gangarm@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-06-30 14:57:42 -07:00
+								    int error;
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
-												Replace most uses of assert by ovs_assert.

This is a straight search-and-replace, except that I also removed #include
<assert.h> from each file where there were no assert calls left.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-06 13:14:55 -08:00
+								    ovs_assert(buf->allocated >= sizeof *nlmsghdr);
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    ofpbuf_clear(buf);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								    iov[0].iov_base = ofpbuf_base(buf);
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    iov[0].iov_len = buf->allocated;
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								    iov[1].iov_base = tail;
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    iov[1].iov_len = sizeof tail;
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
 								    memset(&msg, 0, sizeof msg);
 								    msg.msg_iov = iov;
 								    msg.msg_iovlen = 2;
-												netlink-socket: Work around upstream kernel Netlink bug.

The upstream kernel net/netlink/af_netlink.c netlink_recvmsg() contains the
following code to refill the Netlink socket buffer with more dump skbs
while a dump is in progress:

	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}

The netlink_dump() function that this calls returns a negative number on
error, the convention used throughout the kernel, and thus sk->sk_err
receives a negative value on error.

However, sk->sk_err is supposed to contain either 0 or a positive errno
value, as one can see from a quick "grep" through net for 'sk_err =', e.g.:

    ipv4/tcp.c:2067:		sk->sk_err = ECONNRESET;
    ipv4/tcp.c:2069:		sk->sk_err = ECONNRESET;
    ipv4/tcp_input.c:4106:		sk->sk_err = ECONNREFUSED;
    ipv4/tcp_input.c:4109:		sk->sk_err = EPIPE;
    ipv4/tcp_input.c:4114:		sk->sk_err = ECONNRESET;
    netlink/af_netlink.c:741:			sk->sk_err = ENOBUFS;
    netlink/af_netlink.c:1796:			sk->sk_err = ENOBUFS;
    packet/af_packet.c:2476:		sk->sk_err = ENETDOWN;
    unix/af_unix.c:341:			other->sk_err = ECONNRESET;
    unix/af_unix.c:407:				skpair->sk_err = ECONNRESET;

The result is that the next attempt to receive from the socket will return
the error to userspace with the wrong sign.

(The root of the error in this case is that multiple threads are attempting
to read a single flow dump from a shared fd.  That should work, but the
kernel has an internal race that can result in one or more of those threads
hitting the EINVAL case at the start of netlink_dump().  The EINVAL is
harmless in this case and userspace should be able to ignore it, but
reporting the EINVAL as if it were a 22-byte message received in userspace
throws a real wrench in the works.)

This bug makes me think that there are probably not many programs doing
multithreaded Netlink dumps.  Maybe it is good that we are considering
other approaches.

VMware-BZ: #1255704
Reported-by: Mihir Gangar <gangarm@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-06-30 14:57:42 -07:00
+								    /* Receive a Netlink message from the kernel.
 								     *
 								     * This works around a kernel bug in which the kernel returns an error code
 								     * as if it were the number of bytes read.  It doesn't actually modify
 								     * anything in the receive buffer in that case, so we can initialize the
 								     * Netlink header with an impossible message length and then, upon success,
 								     * check whether it changed. */
 								    nlmsghdr = ofpbuf_base(buf);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    do {
-												netlink-socket: Work around upstream kernel Netlink bug.

The upstream kernel net/netlink/af_netlink.c netlink_recvmsg() contains the
following code to refill the Netlink socket buffer with more dump skbs
while a dump is in progress:

	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}

The netlink_dump() function that this calls returns a negative number on
error, the convention used throughout the kernel, and thus sk->sk_err
receives a negative value on error.

However, sk->sk_err is supposed to contain either 0 or a positive errno
value, as one can see from a quick "grep" through net for 'sk_err =', e.g.:

    ipv4/tcp.c:2067:		sk->sk_err = ECONNRESET;
    ipv4/tcp.c:2069:		sk->sk_err = ECONNRESET;
    ipv4/tcp_input.c:4106:		sk->sk_err = ECONNREFUSED;
    ipv4/tcp_input.c:4109:		sk->sk_err = EPIPE;
    ipv4/tcp_input.c:4114:		sk->sk_err = ECONNRESET;
    netlink/af_netlink.c:741:			sk->sk_err = ENOBUFS;
    netlink/af_netlink.c:1796:			sk->sk_err = ENOBUFS;
    packet/af_packet.c:2476:		sk->sk_err = ENETDOWN;
    unix/af_unix.c:341:			other->sk_err = ECONNRESET;
    unix/af_unix.c:407:				skpair->sk_err = ECONNRESET;

The result is that the next attempt to receive from the socket will return
the error to userspace with the wrong sign.

(The root of the error in this case is that multiple threads are attempting
to read a single flow dump from a shared fd.  That should work, but the
kernel has an internal race that can result in one or more of those threads
hitting the EINVAL case at the start of netlink_dump().  The EINVAL is
harmless in this case and userspace should be able to ignore it, but
reporting the EINVAL as if it were a 22-byte message received in userspace
throws a real wrench in the works.)

This bug makes me think that there are probably not many programs doing
multithreaded Netlink dumps.  Maybe it is good that we are considering
other approaches.

VMware-BZ: #1255704
Reported-by: Mihir Gangar <gangarm@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-06-30 14:57:42 -07:00
+								        nlmsghdr->nlmsg_len = UINT32_MAX;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC, use a handle instead of an fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								        boolean result = false;
 								        DWORD last_error = 0;
 								        result = ReadFile(sock->handle, tail, MAX_STACK_LENGTH, &retval, NULL);
 								        last_error = GetLastError();
 								        if (last_error != ERROR_SUCCESS && !result) {
 								            retval = -1;
 								            errno = EAGAIN;
 								        } else {
 								            ofpbuf_put(buf, tail, retval);
 								        }
 								#else
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								        retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC, use a handle instead of an fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Work around upstream kernel Netlink bug.

The upstream kernel net/netlink/af_netlink.c netlink_recvmsg() contains the
following code to refill the Netlink socket buffer with more dump skbs
while a dump is in progress:

	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}

The netlink_dump() function that this calls returns a negative number on
error, the convention used throughout the kernel, and thus sk->sk_err
receives a negative value on error.

However, sk->sk_err is supposed to contain either 0 or a positive errno
value, as one can see from a quick "grep" through net for 'sk_err =', e.g.:

    ipv4/tcp.c:2067:		sk->sk_err = ECONNRESET;
    ipv4/tcp.c:2069:		sk->sk_err = ECONNRESET;
    ipv4/tcp_input.c:4106:		sk->sk_err = ECONNREFUSED;
    ipv4/tcp_input.c:4109:		sk->sk_err = EPIPE;
    ipv4/tcp_input.c:4114:		sk->sk_err = ECONNRESET;
    netlink/af_netlink.c:741:			sk->sk_err = ENOBUFS;
    netlink/af_netlink.c:1796:			sk->sk_err = ENOBUFS;
    packet/af_packet.c:2476:		sk->sk_err = ENETDOWN;
    unix/af_unix.c:341:			other->sk_err = ECONNRESET;
    unix/af_unix.c:407:				skpair->sk_err = ECONNRESET;

The result is that the next attempt to receive from the socket will return
the error to userspace with the wrong sign.

(The root of the error in this case is that multiple threads are attempting
to read a single flow dump from a shared fd.  That should work, but the
kernel has an internal race that can result in one or more of those threads
hitting the EINVAL case at the start of netlink_dump().  The EINVAL is
harmless in this case and userspace should be able to ignore it, but
reporting the EINVAL as if it were a 22-byte message received in userspace
throws a real wrench in the works.)

This bug makes me think that there are probably not many programs doing
multithreaded Netlink dumps.  Maybe it is good that we are considering
other approaches.

VMware-BZ: #1255704
Reported-by: Mihir Gangar <gangarm@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-06-30 14:57:42 -07:00
+								        error = (retval < 0 ? errno
 								                 : retval == 0 ? ECONNRESET /* not possible? */
 								                 : nlmsghdr->nlmsg_len != UINT32_MAX ? 0
-												netlink-socket: Fix sign of error code.

Commit 8f20fd98db (netlink-socket: Work around upstream kernel Netlink
bug.) got the sign of the error code wrong, so that it reported e.g. -22
for EINVAL to nl_sock_recv__()'s caller, instead of 22.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-07-10 14:32:10 -07:00
+								                 : retval);
-												netlink-socket: Work around upstream kernel Netlink bug.

The upstream kernel net/netlink/af_netlink.c netlink_recvmsg() contains the
following code to refill the Netlink socket buffer with more dump skbs
while a dump is in progress:

	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}

The netlink_dump() function that this calls returns a negative number on
error, the convention used throughout the kernel, and thus sk->sk_err
receives a negative value on error.

However, sk->sk_err is supposed to contain either 0 or a positive errno
value, as one can see from a quick "grep" through net for 'sk_err =', e.g.:

    ipv4/tcp.c:2067:		sk->sk_err = ECONNRESET;
    ipv4/tcp.c:2069:		sk->sk_err = ECONNRESET;
    ipv4/tcp_input.c:4106:		sk->sk_err = ECONNREFUSED;
    ipv4/tcp_input.c:4109:		sk->sk_err = EPIPE;
    ipv4/tcp_input.c:4114:		sk->sk_err = ECONNRESET;
    netlink/af_netlink.c:741:			sk->sk_err = ENOBUFS;
    netlink/af_netlink.c:1796:			sk->sk_err = ENOBUFS;
    packet/af_packet.c:2476:		sk->sk_err = ENETDOWN;
    unix/af_unix.c:341:			other->sk_err = ECONNRESET;
    unix/af_unix.c:407:				skpair->sk_err = ECONNRESET;

The result is that the next attempt to receive from the socket will return
the error to userspace with the wrong sign.

(The root of the error in this case is that multiple threads are attempting
to read a single flow dump from a shared fd.  That should work, but the
kernel has an internal race that can result in one or more of those threads
hitting the EINVAL case at the start of netlink_dump().  The EINVAL is
harmless in this case and userspace should be able to ignore it, but
reporting the EINVAL as if it were a 22-byte message received in userspace
throws a real wrench in the works.)

This bug makes me think that there are probably not many programs doing
multithreaded Netlink dumps.  Maybe it is good that we are considering
other approaches.

VMware-BZ: #1255704
Reported-by: Mihir Gangar <gangarm@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-06-30 14:57:42 -07:00
+								    } while (error == EINTR);
 								    if (error) {
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								        if (error == ENOBUFS) {
 								            /* Socket receive buffer overflow dropped one or more messages that
 								             * the kernel tried to send to us. */
 								            COVERAGE_INC(netlink_overflow);
 								        }
 								        return error;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    if (msg.msg_flags & MSG_TRUNC) {
-												Avoid printf type modifiers not supported by MSVC C runtime library.

The MSVC C library printf() implementation does not support the 'z', 't',
'j', or 'hh' format specifiers.  This commit changes the Open vSwitch code
to avoid those format specifiers, switching to standard macros from
<inttypes.h> where available and inventing new macros resembling them
where necessary.  It also updates CodingStyle to specify the macros' use
and adds a Makefile rule to report violations.

Signed-off-by: Alin Serdean <aserdean@cloudbasesolutions.com>
Co-authored-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-25 23:38:48 -08:00
+								        VLOG_ERR_RL(&rl, "truncated message (longer than %"PRIuSIZE" bytes)",
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								                    sizeof tail);
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								        return E2BIG;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								    if (retval < sizeof *nlmsghdr
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
-												netlink-socket: Reduce nl_sock_recv() from 2 (or more) system calls to 1.

Until now, each attempt to receive a message from a Netlink socket has
taken at least two system calls, one to check the size of the message to
be received and a second one to delete the message from the socket buffer.
This commit switches to a new strategy that requires only one system call
per message received.

In my testing this increases the maximum flow setups per second by a little
over 10%.

											
										
										
											2011-07-27 14:56:03 -07:00
+								        || nlmsghdr->nlmsg_len > retval) {
-												Fix log message weird suffixes.

I think these were leftovers from the removal of %z for MSVC that happened
some time ago.

VMware-BZ: 1265762
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Pritesh Kothari <pritesh.kothari@cisco.com>

											
										
										
											2014-06-11 09:14:54 -07:00
+								        VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE" bytes < %"PRIuSIZE")",
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								                    retval, sizeof *nlmsghdr);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        return EPROTO;
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC, use a handle instead of an fd (sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifndef _WIN32
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								    ofpbuf_set_size(buf, MIN(retval, buf->allocated));
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    if (retval > buf->allocated) {
 								        COVERAGE_INC(netlink_recv_jumbo);
 								        ofpbuf_put(buf, tail, retval - buf->allocated);
 								    }
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								    log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    COVERAGE_INC(netlink_received);
 								    return 0;
 								}
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
/* Receives one Netlink message from the kernel on 'sock' and stores it in
 * 'buf'.
 *
 * When 'wait' is true the call waits until a message is ready.  When 'wait'
 * is false and the receive buffer of 'sock' is empty, the call fails with
 * EAGAIN.
 *
 * 'buf' must already have been initialized with an allocation of at least
 * NLMSG_HDRLEN bytes.  Allocating enough room for a "typical" message up
 * front gives the best performance.
 *
 * Returns 0 on success, in which case the received message replaces whatever
 * 'buf' held before; 'buf''s allocated memory is expanded as necessary to
 * hold the whole message.
 *
 * Returns a positive errno value on failure, in which case 'buf' is cleared
 * to zero length but retains its previous memory allocation.
 *
 * In either case 'buf''s headroom is reset to 0. */
int
nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    /* Public entry point; the receive logic itself is in nl_sock_recv__(). */
    int error = nl_sock_recv__(sock, buf, wait);

    return error;
}
 								static void
 								nl_sock_record_errors__(struct nl_transaction **transactions, size_t n,
 								                        int error)
 								{
 								    size_t i;
 								    for (i = 0; i < n; i++) {
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        struct nl_transaction *txn = transactions[i];
 								        txn->error = error;
 								        if (txn->reply) {
 								            ofpbuf_clear(txn->reply);
 								        }
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    }
 								}
 								static int
 								nl_sock_transact_multiple__(struct nl_sock *sock,
 								                            struct nl_transaction **transactions, size_t n,
 								                            size_t *done)
 								{
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    uint64_t tmp_reply_stub[1024 / 8];
 								    struct nl_transaction tmp_txn;
 								    struct ofpbuf tmp_reply;
 								    uint32_t base_seq;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    struct iovec iovs[MAX_IOVS];
 								    struct msghdr msg;
 								    int error;
 								    int i;
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    base_seq = nl_sock_allocate_seq(sock, n);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    *done = 0;
 								    for (i = 0; i < n; i++) {
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        struct nl_transaction *txn = transactions[i];
 								        struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(txn->request);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								        nlmsg->nlmsg_len = ofpbuf_size(txn->request);
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        nlmsg->nlmsg_seq = base_seq + i;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        nlmsg->nlmsg_pid = sock->pid;
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								        iovs[i].iov_base = ofpbuf_data(txn->request);
 								        iovs[i].iov_len = ofpbuf_size(txn->request);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    }
 								    memset(&msg, 0, sizeof msg);
 								    msg.msg_iov = iovs;
 								    msg.msg_iovlen = n;
 								    do {
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    DWORD last_error = 0;
 								    bool result = FALSE;
 								    for (i = 0; i < n; i++) {
 								        result = WriteFile((HANDLE)sock->handle, iovs[i].iov_base, iovs[i].iov_len,
 								                           &error, NULL);
 								        last_error = GetLastError();
 								        if (last_error != ERROR_SUCCESS && !result) {
 								            error = EAGAIN;
 								            errno = EAGAIN;
 								        } else {
 								            error = 0;
 								        }
 								    }
 								#else
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    } while (error == EINTR);
 								    for (i = 0; i < n; i++) {
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        struct nl_transaction *txn = transactions[i];
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								        log_nlmsg(__func__, error, ofpbuf_data(txn->request), ofpbuf_size(txn->request),
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								                  sock->protocol);
 								    }
 								    if (!error) {
 								        COVERAGE_ADD(netlink_sent, n);
 								    }
 								    if (error) {
 								        return error;
 								    }
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    ofpbuf_use_stub(&tmp_reply, tmp_reply_stub, sizeof tmp_reply_stub);
 								    tmp_txn.request = NULL;
 								    tmp_txn.reply = &tmp_reply;
 								    tmp_txn.error = 0;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    while (n > 0) {
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        struct nl_transaction *buf_txn, *txn;
 								        uint32_t seq;
 								        /* Find a transaction whose buffer we can use for receiving a reply.
 								         * If no such transaction is left, use tmp_txn. */
 								        buf_txn = &tmp_txn;
 								        for (i = 0; i < n; i++) {
 								            if (transactions[i]->reply) {
 								                buf_txn = transactions[i];
 								                break;
 								            }
 								        }
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        /* Receive a reply. */
 								        error = nl_sock_recv__(sock, buf_txn->reply, false);
 								        if (error) {
 								            if (error == EAGAIN) {
 								                nl_sock_record_errors__(transactions, n, 0);
 								                *done += n;
 								                error = 0;
 								            }
 								            break;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        }
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        /* Match the reply up with a transaction. */
 								        seq = nl_msg_nlmsghdr(buf_txn->reply)->nlmsg_seq;
 								        if (seq < base_seq || seq >= base_seq + n) {
 								            VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32, seq);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								            continue;
 								        }
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        i = seq - base_seq;
 								        txn = transactions[i];
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        /* Fill in the results for 'txn'. */
 								        if (nl_msg_nlmsgerr(buf_txn->reply, &txn->error)) {
 								            if (txn->reply) {
 								                ofpbuf_clear(txn->reply);
 								            }
 								            if (txn->error) {
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								                VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                            error, ovs_strerror(txn->error));
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								            }
 								        } else {
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								            txn->error = 0;
 								            if (txn->reply && txn != buf_txn) {
 								                /* Swap buffers. */
 								                struct ofpbuf *reply = buf_txn->reply;
 								                buf_txn->reply = txn->reply;
 								                txn->reply = reply;
 								            }
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        }
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        /* Fill in the results for transactions before 'txn'.  (We have to do
 								         * this after the results for 'txn' itself because of the buffer swap
 								         * above.) */
 								        nl_sock_record_errors__(transactions, i, 0);
 								        /* Advance. */
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        *done += i + 1;
 								        transactions += i + 1;
 								        n -= i + 1;
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        base_seq += i + 1;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    }
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    ofpbuf_uninit(&tmp_reply);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    return error;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								}
-												netlink-socket: Add conceptual documentation.

Based on a conversation with the VMware Hyper-V team earlier today.

This commit also changes a couple of functions that were only used with
netlink-socket.c into static functions.  I couldn't think of a reason for
code outside that file to use them.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 08:59:40 -07:00
+								static void
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								nl_sock_transact_multiple(struct nl_sock *sock,
 								                          struct nl_transaction **transactions, size_t n)
 								{
 								    int max_batch_count;
 								    int error;
 								    if (!n) {
 								        return;
 								    }
 								    /* In theory, every request could have a 64 kB reply.  But the default and
 								     * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to
 								     * be a bit below 128 kB, so that would only allow a single message in a
 								     * "batch".  So we assume that replies average (at most) 4 kB, which allows
 								     * a good deal of batching.
 								     *
 								     * In practice, most of the requests that we batch either have no reply at
 								     * all or a brief reply. */
 								    max_batch_count = MAX(sock->rcvbuf / 4096, 1);
 								    max_batch_count = MIN(max_batch_count, max_iovs);
 								    while (n > 0) {
 								        size_t count, bytes;
 								        size_t done;
 								        /* Batch up to 'max_batch_count' transactions.  But cap it at about a
 								         * page of requests total because big skbuffs are expensive to
 								         * allocate in the kernel.  */
 								#if defined(PAGESIZE)
 								        enum { MAX_BATCH_BYTES = MAX(1, PAGESIZE - 512) };
 								#else
 								        enum { MAX_BATCH_BYTES = 4096 - 512 };
 								#endif
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								        bytes = ofpbuf_size(transactions[0]->request);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        for (count = 1; count < n && count < max_batch_count; count++) {
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								            if (bytes + ofpbuf_size(transactions[count]->request) > MAX_BATCH_BYTES) {
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								                break;
 								            }
-												ofpbuf: Introduce access api for base, data and size.

These functions will be used by later patches.  Following patch
does not change functionality.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-03-30 01:31:50 -07:00
+								            bytes += ofpbuf_size(transactions[count]->request);
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								        }
 								        error = nl_sock_transact_multiple__(sock, transactions, count, &done);
 								        transactions += done;
 								        n -= done;
 								        if (error == ENOBUFS) {
 								            VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
 								        } else if (error) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								            VLOG_ERR_RL(&rl, "transaction error (%s)", ovs_strerror(error));
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								            nl_sock_record_errors__(transactions, n, error);
 								        }
 								    }
 								}
-												netlink-socket: Add conceptual documentation.

Based on a conversation with the VMware Hyper-V team earlier today.

This commit also changes a couple of functions that were only used with
netlink-socket.c into static functions.  I couldn't think of a reason for
code outside that file to use them.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 08:59:40 -07:00
+								static int
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
 								                 struct ofpbuf **replyp)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    struct nl_transaction *transactionp;
 								    struct nl_transaction transaction;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
-												util: New macro CONST_CAST.

Casts are sometimes necessary.  One common reason that they are necessary
is for discarding a "const" qualifier.  However, this can impede
maintenance: if the type of the expression being cast changes, then the
presence of the cast can hide a necessary change in the code that does the
cast.  Using CONST_CAST, instead of a bare cast, makes these changes
visible.

Inspired by my own work elsewhere:
http://git.savannah.gnu.org/cgit/pspp.git/tree/src/libpspp/cast.h#n80

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-07-13 16:00:29 -07:00
+								    transaction.request = CONST_CAST(struct ofpbuf *, request);
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								    transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    transactionp = &transaction;
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    nl_sock_transact_multiple(sock, &transactionp, 1);
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    if (replyp) {
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
+								        if (transaction.error) {
 								            ofpbuf_delete(transaction.reply);
 								            *replyp = NULL;
 								        } else {
 								            *replyp = transaction.reply;
 								        }
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
-												netlink-socket: Make caller provide message receive buffers.

Typically an nl_sock client can stack-allocate the buffer for receiving
a Netlink message, which provides a performance boost.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-09 15:35:29 -07:00
-												netlink-socket: New function nl_sock_transact_multiple().

This will be used in an upcoming commit.

											
										
										
											2011-10-14 13:55:00 -07:00
+								    return transaction.error;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								}
-												netlink-socket: New function for draining the receive buffer.

This will be used in an upcoming patch.

Reviewed by Justin Pettit.

											
										
										
											2011-01-11 16:05:37 -08:00
+								/* Drain all the messages currently in 'sock''s receive queue. */
 								int
 								nl_sock_drain(struct nl_sock *sock)
 								{
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    return 0;
 								#else
-												netlink-socket: New function for draining the receive buffer.

This will be used in an upcoming patch.

Reviewed by Justin Pettit.

											
										
										
											2011-01-11 16:05:37 -08:00
+								    return drain_rcvbuf(sock->fd);
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink-socket: New function for draining the receive buffer.

This will be used in an upcoming patch.

Reviewed by Justin Pettit.

											
										
										
											2011-01-11 16:05:37 -08:00
+								}
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								/* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
 								 * Netlink socket created with the given 'protocol', and initializes 'dump' to
 								 * reflect the state of the operation.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 *
-												netlink: Update comment for nl_dump_start().

The function comment still referred to a 'msg' variable, which has been
renamed.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
[blp@nicira.com did further proofreading]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-13 13:50:22 -08:00
+								 * 'request' must contain a Netlink message.  Before sending the message,
 								 * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
 								 * set to the Netlink socket's pid.  NLM_F_DUMP and NLM_F_ACK will be set in
 								 * nlmsg_flags.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 *
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								 * The design of this Netlink socket library ensures that the dump is reliable.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 *
-												netlink: Update comment for nl_dump_start().

The function comment still referred to a 'msg' variable, which has been
renamed.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
[blp@nicira.com did further proofreading]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-13 13:50:22 -08:00
+								 * This function provides no status indication.  nl_dump_done() provides an
 								 * error status for the entire dump operation.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 *
-												netlink: Update comment for nl_dump_start().

The function comment still referred to a 'msg' variable, which has been
renamed.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
[blp@nicira.com did further proofreading]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-13 13:50:22 -08:00
+								 * The caller must eventually destroy 'request'.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 */
 								void
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
-												netlink: Postpone choosing sequence numbers until send time.

Choosing sequence numbers at time of creating a packet means that
nl_sock_transact_multiple() has to search for the sequence number
of a reply, because the sequence numbers of the requests aren't
necessarily sequential.  This commit makes it possible to avoid
the search, by deferring choice of sequence numbers until the
time that we send the packets.  It doesn't actually modify
nl_sock_transact_multiple(), which will happen in a later commit.

Previously, I was concerned about a theoretical race condition
described in a comment in the old versino of this code:

    This implementation uses sequence numbers that are unique
    process-wide, to avoid a hypothetical race: send request, close
    socket, open new socket that reuses the old socket's PID value,
    send request on new socket, receive reply from kernel to old
    socket but with same PID and sequence number.  (This race could be
    avoided other ways, e.g. by preventing PIDs from being quickly
    reused).

However, I no longer believe that this can be a real problem,
because Netlink operates synchronously.  The reply to a request
will always arrive before the socket can be closed and a new
socket opened with the old socket's PID.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-16 16:01:01 -07:00
+								    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
 								    ovs_mutex_init(&dump->mutex);
 								    ovs_mutex_lock(&dump->mutex);
 								    dump->status = nl_pool_alloc(protocol, &dump->sock);
 								    if (!dump->status) {
 								        dump->status = nl_sock_send__(dump->sock, request,
 								                                      nl_sock_allocate_seq(dump->sock, 1),
 								                                      true);
-												netlink-socket: Fix handling socket allocation failure in nl_dump_start().

If nl_pool_alloc() failed, then 'dump' was not initialized at all and
further use of the dump would access uninitialized data, probably causing
a crash.

Found by inspection.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2014-07-14 14:06:03 -07:00
+								    }
-												netlink: Rename 'dump->seq' to 'dump->nl_seq'

An upcoming patch will introduce another, completely unrelated seq to
'struct nl_dump'. Giving this one a better name should reduce confusion.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-21 11:29:26 -08:00
+								    dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    ovs_mutex_unlock(&dump->mutex);
 								}
 								/* Makes sure 'buffer' holds reply data for 'dump', receiving from
 								 * 'dump->sock' for as long as 'buffer' is empty.  Received data whose
 								 * sequence number differs from 'dump->nl_seq' is logged and thrown away.
 								 *
 								 * Returns 0 if 'buffer' ends up nonempty and does not carry a nonzero
 								 * Netlink error, EOF if the receive reported EAGAIN, and otherwise the
 								 * error value from the receive or from the Netlink error message (in the
 								 * latter case 'buffer' is cleared first).
 								 *
 								 * The caller must hold 'dump->mutex'. */
 								static int
 								nl_dump_refill(struct nl_dump *dump, struct ofpbuf *buffer)
 								    OVS_REQUIRES(dump->mutex)
 								{
 								    int err;

 								    for (;;) {
 								        struct nlmsghdr *hdr;

 								        if (ofpbuf_size(buffer)) {
 								            break;
 								        }

 								        err = nl_sock_recv__(dump->sock, buffer, false);
 								        if (err == EAGAIN) {
 								            /* The kernel never blocks while producing dump results, so
 								             * EAGAIN means everything has been read already; report it
 								             * as EOF.  (NLMSG_DONE is always sent as a sentinel, so
 								             * another thread must have consumed it without having
 								             * recorded that in 'status' yet.) */
 								            return EOF;
 								        } else if (err) {
 								            /* Any other failure is a genuine error. */
 								            return err;
 								        }

 								        hdr = nl_msg_nlmsghdr(buffer);
 								        if (hdr->nlmsg_seq == dump->nl_seq) {
 								            continue;
 								        }

 								        /* Not part of this dump: drop it and receive again. */
 								        VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32,
 								                    hdr->nlmsg_seq, dump->nl_seq);
 								        ofpbuf_clear(buffer);
 								    }

 								    if (!nl_msg_nlmsgerr(buffer, &err) || !err) {
 								        return 0;
 								    }

 								    VLOG_INFO_RL(&rl, "netlink dump request error (%s)",
 								                 ovs_strerror(err));
 								    ofpbuf_clear(buffer);
 								    return err;
 								}
 								static int
 								nl_dump_next__(struct ofpbuf *reply, struct ofpbuf *buffer)
 								{
 								    struct nlmsghdr *nlmsghdr = nl_msg_next(buffer, reply);
 								    if (!nlmsghdr) {
 								        VLOG_WARN_RL(&rl, "netlink dump contains message fragment");
 								        return EPROTO;
 								    } else if (nlmsghdr->nlmsg_type == NLMSG_DONE) {
 								        return EOF;
 								    } else {
 								        return 0;
 								    }
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								}
-												netlink: Remove buffer from 'struct nl_dump'.

This patch makes all of the users of 'struct nl_dump' allocate their own
buffers to pass down to nl_dump_next(). This paves the way for allowing
multithreaded flow dumping.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:05 -08:00
+								/* Attempts to retrieve another reply from 'dump' into 'buffer'. 'dump' must
 								 * have been initialized with nl_dump_start(), and 'buffer' must have been
 								 * initialized. 'buffer' should be at least NL_DUMP_BUFSIZE bytes long.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 *
-												netlink-socket: Refill comment to fit within 79 columns.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2014-07-14 13:40:18 -07:00
+								 * If successful, returns true and points 'reply->data' and
 								 * 'ofpbuf_size(reply)' to the message that was retrieved. The caller must not
 								 * modify 'reply' (because it points within 'buffer', which will be used by
 								 * future calls to this function).
 								 *
 								 * On failure, returns false and sets 'reply->data' to NULL and
 								 * 'ofpbuf_size(reply)' to 0.  Failure might indicate an actual error or merely
 								 * the end of replies.  An error status for the entire dump operation is
 								 * provided when it is completed by calling nl_dump_done().
-												netlink: Make nl_dump_next() thread-safe.

This patch modifies 'struct nl_dump' and nl_dump_next() to allow
multiple threads to share the same nl_dump. These changes are targeted
around synchronizing dump status between multiple callers, and
allowing callers to fully process their existing buffers before
determining whether to stop fetching flows.

The 'status' field of 'struct nl_dump' becomes atomic, so that multiple
threads may check and/or update it to communicate when there is an error
or the netlink dump is finished. The low bit holds whether the final
message was seen, while the higher bits hold an errno value.

nl_dump_next() will now read all messages from the given buffer before
checking the shared error status and attempting to fetch more. Multiple
threads may call this with the same nl_dump, but must provide
independent buffers. As previously, the final dump status can be
determined by calling nl_dump_done() from a single thread.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:06 -08:00
+								 *
 								 * Multiple threads may call this function, passing the same nl_dump, however
 								 * each must provide independent buffers. This function may cache multiple
 								 * replies in the buffer, and these will be processed before more replies are
 								 * fetched. When this function returns false, other threads may continue to
 								 * process replies in their buffers, but they will not fetch more replies.
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								 */
 								bool
-												netlink: Remove buffer from 'struct nl_dump'.

This patch makes all of the users of 'struct nl_dump' allocate their own
buffers to pass down to nl_dump_next(). This paves the way for allowing
multithreaded flow dumping.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:05 -08:00
+								nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    int retval = 0;
-												netlink: Make nl_dump_next() thread-safe.

This patch modifies 'struct nl_dump' and nl_dump_next() to allow
multiple threads to share the same nl_dump. These changes are targeted
around synchronizing dump status between multiple callers, and
allowing callers to fully process their existing buffers before
determining whether to stop fetching flows.

The 'status' field of 'struct nl_dump' becomes atomic, so that multiple
threads may check and/or update it to communicate when there is an error
or the netlink dump is finished. The low bit holds whether the final
message was seen, while the higher bits hold an errno value.

nl_dump_next() will now read all messages from the given buffer before
checking the shared error status and attempting to fetch more. Multiple
threads may call this with the same nl_dump, but must provide
independent buffers. As previously, the final dump status can be
determined by calling nl_dump_done() from a single thread.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:06 -08:00
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    /* If the buffer is empty, refill it.
 								     *
 								     * If the buffer is not empty, we don't check the dump's status.
 								     * Otherwise, we could end up skipping some of the dump results if thread A
 								     * hits EOF while thread B is in the midst of processing a batch. */
 								    if (!ofpbuf_size(buffer)) {
-												netlink-socket: Work around kernel Netlink dump thread races.

The Linux kernel Netlink implementation has two races that cause problems
for processes that attempt to dump a table in a multithreaded manner.

The first race is in the structure of the kernel netlink_recv() function.
This function pulls a message from the socket queue and, if there is none,
reports EAGAIN:
	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
		goto out;
Only if a message is successfully read from the socket queue does the
function, toward the end, try to queue up a new message to be dumped:
	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}
This means that if thread A reads a message from a dump, then thread B
attempts to read one before A queues up the next, B will get EAGAIN.  This
means that, following EAGAIN, B needs to wait until A returns to userspace
before it tries to read the socket again.  nl_dump_next() already does
this, using 'dump->status_seq' (although the need for it has never been
explained clearly, to my knowledge).

The second race is more serious.  Suppose thread X and thread Y both
simultaneously attempt to queue up a new message to be dumped, using the
call to netlink_dump() quoted above.  netlink_dump() begins with:
	mutex_lock(nlk->cb_mutex);

	cb = nlk->cb;
	if (cb == NULL) {
		err = -EINVAL;
		goto errout_skb;
	}
Suppose that X gets cb_mutex first and finds that the dump is complete.  It
will therefore, toward the end of netlink_dump(), clear nlk->cb to NULL to
indicate that no dump is in progress and release the mutex:
	nlk->cb = NULL;
	mutex_unlock(nlk->cb_mutex);
When Y grabs cb_mutex afterward, it will see that nlk->cb is NULL and
return -EINVAL as quoted above.  netlink_recv() stuffs -EINVAL in sk_err,
but that error is not reported immediately; instead, it is saved for the
next read from the socket.  Since Open vSwitch maintains a pool of Netlink
sockets, that next failure can crop up pretty much anywhere.  One of the
worst places for it to crop up is in the execution of a later transaction
(e.g. in nl_sock_transact_multiple__()), because userspace treats Netlink
transactions as idempotent and will re-execute them when socket errors
occur.  For a transaction that sends a packet, this causes packet
duplication, which we actually observed in practice.  (ENOBUFS should
actually cause transactions to be re-executed in many cases, but EINVAL
should not; this is a separate bug in the userspace netlink code.)

VMware-BZ: #1283188
Reported-and-tested-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-07-10 16:48:16 -07:00
+								        ovs_mutex_lock(&dump->mutex);
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								        if (!dump->status) {
 								            /* Take the mutex here to avoid an in-kernel race.  If two threads
 								             * try to read from a Netlink dump socket at once, then the socket
 								             * error can be set to EINVAL, which will be encountered on the
 								             * next recv on that socket, which could be anywhere due to the way
 								             * that we pool Netlink sockets.  Serializing the recv calls avoids
 								             * the issue. */
 								            dump->status = nl_dump_refill(dump, buffer);
 								        }
 								        retval = dump->status;
-												netlink-socket: Work around kernel Netlink dump thread races.

The Linux kernel Netlink implementation has two races that cause problems
for processes that attempt to dump a table in a multithreaded manner.

The first race is in the structure of the kernel netlink_recv() function.
This function pulls a message from the socket queue and, if there is none,
reports EAGAIN:
	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
		goto out;
Only if a message is successfully read from the socket queue does the
function, toward the end, try to queue up a new message to be dumped:
	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}
This means that if thread A reads a message from a dump, then thread B
attempts to read one before A queues up the next, B will get EAGAIN.  This
means that, following EAGAIN, B needs to wait until A returns to userspace
before it tries to read the socket again.  nl_dump_next() already does
this, using 'dump->status_seq' (although the need for it has never been
explained clearly, to my knowledge).

The second race is more serious.  Suppose thread X and thread Y both
simultaneously attempt to queue up a new message to be dumped, using the
call to netlink_dump() quoted above.  netlink_dump() begins with:
	mutex_lock(nlk->cb_mutex);

	cb = nlk->cb;
	if (cb == NULL) {
		err = -EINVAL;
		goto errout_skb;
	}
Suppose that X gets cb_mutex first and finds that the dump is complete.  It
will therefore, toward the end of netlink_dump(), clear nlk->cb to NULL to
indicate that no dump is in progress and release the mutex:
	nlk->cb = NULL;
	mutex_unlock(nlk->cb_mutex);
When Y grabs cb_mutex afterward, it will see that nlk->cb is NULL and
return -EINVAL as quoted above.  netlink_recv() stuffs -EINVAL in sk_err,
but that error is not reported immediately; instead, it is saved for the
next read from the socket.  Since Open vSwitch maintains a pool of Netlink
sockets, that next failure can crop up pretty much anywhere.  One of the
worst places for it to crop up is in the execution of a later transaction
(e.g. in nl_sock_transact_multiple__()), because userspace treats Netlink
transactions as idempotent and will re-execute them when socket errors
occur.  For a transaction that sends a packet, this causes packet
duplication, which we actually observed in practice.  (ENOBUFS should
actually cause transactions to be re-executed in many cases, but EINVAL
should not; this is a separate bug in the userspace netlink code.)

VMware-BZ: #1283188
Reported-and-tested-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-07-10 16:48:16 -07:00
+								        ovs_mutex_unlock(&dump->mutex);
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    }
-												netlink-socket: Work around kernel Netlink dump thread races.

The Linux kernel Netlink implementation has two races that cause problems
for processes that attempt to dump a table in a multithreaded manner.

The first race is in the structure of the kernel netlink_recv() function.
This function pulls a message from the socket queue and, if there is none,
reports EAGAIN:
	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
		goto out;
Only if a message is successfully read from the socket queue does the
function, toward the end, try to queue up a new message to be dumped:
	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}
This means that if thread A reads a message from a dump, then thread B
attempts to read one before A queues up the next, B will get EAGAIN.  This
means that, following EAGAIN, B needs to wait until A returns to userspace
before it tries to read the socket again.  nl_dump_next() already does
this, using 'dump->status_seq' (although the need for it has never been
explained clearly, to my knowledge).

The second race is more serious.  Suppose thread X and thread Y both
simultaneously attempt to queue up a new message to be dumped, using the
call to netlink_dump() quoted above.  netlink_dump() begins with:
	mutex_lock(nlk->cb_mutex);

	cb = nlk->cb;
	if (cb == NULL) {
		err = -EINVAL;
		goto errout_skb;
	}
Suppose that X gets cb_mutex first and finds that the dump is complete.  It
will therefore, toward the end of netlink_dump(), clear nlk->cb to NULL to
indicate that no dump is in progress and release the mutex:
	nlk->cb = NULL;
	mutex_unlock(nlk->cb_mutex);
When Y grabs cb_mutex afterward, it will see that nlk->cb is NULL and
return -EINVAL as quoted above.  netlink_recv() stuffs -EINVAL in sk_err,
but that error is not reported immediately; instead, it is saved for the
next read from the socket.  Since Open vSwitch maintains a pool of Netlink
sockets, that next failure can crop up pretty much anywhere.  One of the
worst places for it to crop up is in the execution of a later transaction
(e.g. in nl_sock_transact_multiple__()), because userspace treats Netlink
transactions as idempotent and will re-execute them when socket errors
occur.  For a transaction that sends a packet, this causes packet
duplication, which we actually observed in practice.  (ENOBUFS should
actually cause transactions to be re-executed in many cases, but EINVAL
should not; this is a separate bug in the userspace netlink code.)

VMware-BZ: #1283188
Reported-and-tested-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-07-10 16:48:16 -07:00
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    /* Fetch the next message from the buffer. */
 								    if (!retval) {
 								        retval = nl_dump_next__(reply, buffer);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        if (retval) {
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								            /* Record 'retval' as the dump status, but don't overwrite an error
 								             * with EOF.  */
 								            ovs_mutex_lock(&dump->mutex);
 								            if (dump->status <= 0) {
 								                dump->status = retval;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								            }
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								            ovs_mutex_unlock(&dump->mutex);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        }
 								    }
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    if (retval) {
 								        ofpbuf_set_data(reply, NULL);
 								        ofpbuf_set_size(reply, 0);
-												netlink: Make nl_dump_next() thread-safe.

This patch modifies 'struct nl_dump' and nl_dump_next() to allow
multiple threads to share the same nl_dump. These changes are targeted
around synchronizing dump status between multiple callers, and
allowing callers to fully process their existing buffers before
determining whether to stop fetching flows.

The 'status' field of 'struct nl_dump' becomes atomic, so that multiple
threads may check and/or update it to communicate when there is an error
or the netlink dump is finished. The low bit holds whether the final
message was seen, while the higher bits hold an errno value.

nl_dump_next() will now read all messages from the given buffer before
checking the shared error status and attempting to fetch more. Multiple
threads may call this with the same nl_dump, but must provide
independent buffers. As previously, the final dump status can be
determined by calling nl_dump_done() from a single thread.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:06 -08:00
+								    }
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    return !retval;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								}
 								/* Completes Netlink dump operation 'dump', which must have been initialized
 								 * with nl_dump_start().  Returns 0 if the dump operation was error-free,
 								 * otherwise a positive errno value describing the problem. */
 								int
 								nl_dump_done(struct nl_dump *dump)
 								{
-												netlink: Make nl_dump_next() thread-safe.

This patch modifies 'struct nl_dump' and nl_dump_next() to allow
multiple threads to share the same nl_dump. These changes are targeted
around synchronizing dump status between multiple callers, and
allowing callers to fully process their existing buffers before
determining whether to stop fetching flows.

The 'status' field of 'struct nl_dump' becomes atomic, so that multiple
threads may check and/or update it to communicate when there is an error
or the netlink dump is finished. The low bit holds whether the final
message was seen, while the higher bits hold an errno value.

nl_dump_next() will now read all messages from the given buffer before
checking the shared error status and attempting to fetch more. Multiple
threads may call this with the same nl_dump, but must provide
independent buffers. As previously, the final dump status can be
determined by calling nl_dump_done() from a single thread.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:06 -08:00
+								    int status;
-												netlink: Remove buffer from 'struct nl_dump'.

This patch makes all of the users of 'struct nl_dump' allocate their own
buffers to pass down to nl_dump_next(). This paves the way for allowing
multithreaded flow dumping.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:05 -08:00
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
+								    ovs_mutex_lock(&dump->mutex);
 								    status = dump->status;
 								    ovs_mutex_unlock(&dump->mutex);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    /* Drain any remaining messages that the client didn't read.  Otherwise the
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								     * kernel will continue to queue them up and waste buffer space.
 								     *
 								     * XXX We could just destroy and discard the socket in this case. */
-												netlink: Make nl_dump_next() thread-safe.

This patch modifies 'struct nl_dump' and nl_dump_next() to allow
multiple threads to share the same nl_dump. These changes are targeted
around synchronizing dump status between multiple callers, and
allowing callers to fully process their existing buffers before
determining whether to stop fetching flows.

The 'status' field of 'struct nl_dump' becomes atomic, so that multiple
threads may check and/or update it to communicate when there is an error
or the netlink dump is finished. The low bit holds whether the final
message was seen, while the higher bits hold an errno value.

nl_dump_next() will now read all messages from the given buffer before
checking the shared error status and attempting to fetch more. Multiple
threads may call this with the same nl_dump, but must provide
independent buffers. As previously, the final dump status can be
determined by calling nl_dump_done() from a single thread.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:06 -08:00
+								    if (!status) {
 								        uint64_t tmp_reply_stub[NL_DUMP_BUFSIZE / 8];
 								        struct ofpbuf reply, buf;
 								        ofpbuf_use_stub(&buf, tmp_reply_stub, sizeof tmp_reply_stub);
 								        while (nl_dump_next(dump, &reply, &buf)) {
 								            /* Nothing to do. */
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        }
-												netlink: Make nl_dump_next() thread-safe.

This patch modifies 'struct nl_dump' and nl_dump_next() to allow
multiple threads to share the same nl_dump. These changes are targeted
around synchronizing dump status between multiple callers, and
allowing callers to fully process their existing buffers before
determining whether to stop fetching flows.

The 'status' field of 'struct nl_dump' becomes atomic, so that multiple
threads may check and/or update it to communicate when there is an error
or the netlink dump is finished. The low bit holds whether the final
message was seen, while the higher bits hold an errno value.

nl_dump_next() will now read all messages from the given buffer before
checking the shared error status and attempting to fetch more. Multiple
threads may call this with the same nl_dump, but must provide
independent buffers. As previously, the final dump status can be
determined by calling nl_dump_done() from a single thread.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:06 -08:00
+								        ofpbuf_uninit(&buf);
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
 								        ovs_mutex_lock(&dump->mutex);
 								        status = dump->status;
 								        ovs_mutex_unlock(&dump->mutex);
 								        ovs_assert(status);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								    nl_pool_release(dump->sock);
-												netlink-socket: Work around kernel Netlink dump thread races.

The Linux kernel Netlink implementation has two races that cause problems
for processes that attempt to dump a table in a multithreaded manner.

The first race is in the structure of the kernel netlink_recv() function.
This function pulls a message from the socket queue and, if there is none,
reports EAGAIN:
	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (skb == NULL)
		goto out;
Only if a message is successfully read from the socket queue does the
function, toward the end, try to queue up a new message to be dumped:
	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
		ret = netlink_dump(sk);
		if (ret) {
			sk->sk_err = ret;
			sk->sk_error_report(sk);
		}
	}
This means that if thread A reads a message from a dump, then thread B
attempts to read one before A queues up the next, B will get EAGAIN.  This
means that, following EAGAIN, B needs to wait until A returns to userspace
before it tries to read the socket again.  nl_dump_next() already does
this, using 'dump->status_seq' (although the need for it has never been
explained clearly, to my knowledge).

The second race is more serious.  Suppose thread X and thread Y both
simultaneously attempt to queue up a new message to be dumped, using the
call to netlink_dump() quoted above.  netlink_dump() begins with:
	mutex_lock(nlk->cb_mutex);

	cb = nlk->cb;
	if (cb == NULL) {
		err = -EINVAL;
		goto errout_skb;
	}
Suppose that X gets cb_mutex first and finds that the dump is complete.  It
will therefore, toward the end of netlink_dump(), clear nlk->cb to NULL to
indicate that no dump is in progress and release the mutex:
	nlk->cb = NULL;
	mutex_unlock(nlk->cb_mutex);
When Y grabs cb_mutex afterward, it will see that nlk->cb is NULL and
return -EINVAL as quoted above.  netlink_recv() stuffs -EINVAL in sk_err,
but that error is not reported immediately; instead, it is saved for the
next read from the socket.  Since Open vSwitch maintains a pool of Netlink
sockets, that next failure can crop up pretty much anywhere.  One of the
worst places for it to crop up is in the execution of a later transaction
(e.g. in nl_sock_transact_multiple__()), because userspace treats Netlink
transactions as idempotent and will re-execute them when socket errors
occur.  For a transaction that sends a packet, this causes packet
duplication, which we actually observed in practice.  (ENOBUFS should
actually cause transactions to be re-executed in many cases, but EINVAL
should not; this is a separate bug in the userspace netlink code.)

VMware-BZ: #1283188
Reported-and-tested-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-07-10 16:48:16 -07:00
+								    ovs_mutex_destroy(&dump->mutex);
-												netlink-socket: Simplify multithreaded dumping to match Linux reality.

Commit 0791315e4d (netlink-socket: Work around kernel Netlink dump thread
races.) introduced a simple workaround for Linux kernel races in Netlink
dumps.  However, the code remained more complicated than needed.  This
commit simplifies it.

The main reason for complication in the code was 'status_seq' in nl_dump.
This member was there to allow a thread to wait for some other thread to
refill the socket buffer with another dump message (although we did not
understand the reason at the time it was introduced).  Now that we know
that Netlink dumps properly need to be serialized to work in existing
Linux kernels, there's no additional value in having 'status_seq',
because serialized recvmsg() calls always refill the socket buffer
properly.

This commit updates nl_msg_next() to clear its buffer argument on error.
This is a more convenient interface for the new version of the Netlink
dump code.  nl_msg_next() doesn't have any other callers.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-16 09:39:49 -07:00
 								    return status == EOF ? 0 : status;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								}
 								/* Causes poll_block() to wake up when any of the specified 'events' (which is
 								 * an OR'd combination of POLLIN, POLLOUT, etc.) occur on 'sock'. */
 								void
 								nl_sock_wait(const struct nl_sock *sock, short int events)
 								{
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    poll_fd_wait(sock->handle, events);
 								#else
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    poll_fd_wait(sock->fd, events);
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								}
-												netlink: Expose method to get Netlink pid of a socket.

In the future, the kernel will use unicast messages instead of
multicast to send upcalls.  As a result, we need to be able to
tell it where to direct the traffic.  This adds a function to expose
the Netlink pid of a socket so it can be included in messages to the
kernel.

											
										
										
											2011-09-16 09:37:16 -07:00
-												dpif-linux: Use poll() internally in dpif_linux_recv().

Using poll() internally in dpif_linux_recv(), instead of relying
on the results of the main loop poll() call, brings netperf CRR
performance back within 1% of par versus the code base before the
poll_fd_woke() optimizations were introduced.  It also increases
the ovs-benchmark results by about 5% versus that baseline, too.

My theory is that this is because the main loop takes long enough
that a significant number of packets can arrive during the main
loop itself, so this reduces the time before OVS gets to those
packets.

											
										
										
											2011-11-28 09:29:18 -08:00
+								/* Returns the underlying fd for 'sock', for use in "poll()"-like operations
 								 * that can't use nl_sock_wait().
 								 *
 								 * It's a little tricky to use the returned fd correctly, because nl_sock does
 								 * "copy on write" to allow a single nl_sock to be used for notifications,
 								 * transactions, and dumps.  If 'sock' is used only for notifications and
 								 * transactions (and never for dump) then the usage is safe. */
 								int
 								nl_sock_fd(const struct nl_sock *sock)
 								{
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#ifdef _WIN32
 								    return sock->handle;
 								#else
-												dpif-linux: Use poll() internally in dpif_linux_recv().

Using poll() internally in dpif_linux_recv(), instead of relying
on the results of the main loop poll() call, brings netperf CRR
performance back within 1% of par versus the code base before the
poll_fd_woke() optimizations were introduced.  It also increases
the ovs-benchmark results by about 5% versus that baseline, too.

My theory is that this is because the main loop takes long enough
that a significant number of packets can arrive during the main
loop itself, so this reduces the time before OVS gets to those
packets.

											
										
										
											2011-11-28 09:29:18 -08:00
+								    return sock->fd;
-												netlink-socket: Adapt to Windows and MSVC.

Add two functions set_sock_pid_in_kernel and portid_next. This will allow
the channel identification for the kernel extension to send back messages.

Replace send with WriteFile equivalent and ignore nl_sock_drain for the moment
under MSVC.

Replace sendmsg and recvmsg with ReadFile and WriteFile equivalents.

On MSVC put in handle instead of fd(sock->fd becomes sock->handle).

Creation of the netlink socket will be replaced by CreateFile equivalent.

Add MAX_STACK_LENGTH for MSVC.  This will be our maximum size for on-stack
copy buffer.

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 15:23:28 +00:00
+								#endif
-												dpif-linux: Use poll() internally in dpif_linux_recv().

Using poll() internally in dpif_linux_recv(), instead of relying
on the results of the main loop poll() call, brings netperf CRR
performance back within 1% of par versus the code base before the
poll_fd_woke() optimizations were introduced.  It also increases
the ovs-benchmark results by about 5% versus that baseline, too.

My theory is that this is because the main loop takes long enough
that a significant number of packets can arrive during the main
loop itself, so this reduces the time before OVS gets to those
packets.

											
										
										
											2011-11-28 09:29:18 -08:00
+								}
-												netlink: Expose method to get Netlink pid of a socket.

In the future, the kernel will use unicast messages instead of
multicast to send upcalls.  As a result, we need to be able to
tell it where to direct the traffic.  This adds a function to expose
the Netlink pid of a socket so it can be included in messages to the
kernel.

											
										
										
											2011-09-16 09:37:16 -07:00
+								/* Returns the PID associated with this socket. */
 								uint32_t
 								nl_sock_pid(const struct nl_sock *sock)
 								{
 								    return sock->pid;
 								}
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
 								/* Miscellaneous.  */
-												netlink-socket: Log Generic Netlink family names.

The ids for Generic Netlink family names aren't very helpful because they
can vary from machine to machine and even from one boot to the next.  So
this change logs their names too.

This only affects logging at DBG level.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-11 11:02:32 -08:00
+								struct genl_family {
 								    struct hmap_node hmap_node;
 								    uint16_t id;
 								    char *name;
 								};
 								static struct hmap genl_families = HMAP_INITIALIZER(&genl_families);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								static const struct nl_policy family_policy[CTRL_ATTR_MAX + 1] = {
 								    [CTRL_ATTR_FAMILY_ID] = {.type = NL_A_U16},
-												dpif-linux: Handle nl_lookup_genl_mcgroup() failures.

The nl_lookup_genl_mcgroup() function can fail on older kernels
which do not support the required netlink interface.  Before this
patch, dpif-linux would refuse to create a datapath when this
happened.  With this patch, it attempts to use a workaround.  If
the workaround fails it simply disables the affected features
without completely disabling the dpif.

											
										
										
											2011-09-12 18:57:50 -07:00
+								    [CTRL_ATTR_MCAST_GROUPS] = {.type = NL_A_NESTED, .optional = true},
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								};
-												netlink-socket: Log Generic Netlink family names.

The ids for Generic Netlink family names aren't very helpful because they
can vary from machine to machine and even from one boot to the next.  So
this change logs their names too.

This only affects logging at DBG level.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-11 11:02:32 -08:00
+								static struct genl_family *
 								find_genl_family_by_id(uint16_t id)
 								{
 								    struct genl_family *family;
 								    HMAP_FOR_EACH_IN_BUCKET (family, hmap_node, hash_int(id, 0),
 								                             &genl_families) {
 								        if (family->id == id) {
 								            return family;
 								        }
 								    }
 								    return NULL;
 								}
 								/* Records that Generic Netlink family 'id' is named 'name'.
 								 *
 								 * If 'id' is not yet known, a new entry is added to 'genl_families'.  If it
 								 * is known under a different name, the old name is replaced.  If it is
 								 * already known under 'name', nothing changes.  'name' is copied. */
 								static void
 								define_genl_family(uint16_t id, const char *name)
 								{
 								    struct genl_family *entry = find_genl_family_by_id(id);

 								    if (!entry) {
 								        /* First time this id is seen: create and index a new entry. */
 								        entry = xmalloc(sizeof *entry);
 								        entry->id = id;
 								        hmap_insert(&genl_families, &entry->hmap_node, hash_int(id, 0));
 								    } else if (strcmp(entry->name, name) == 0) {
 								        /* Already recorded under the same name. */
 								        return;
 								    } else {
 								        /* Known id, new name: drop the stale copy before replacing it. */
 								        free(entry->name);
 								    }
 								    entry->name = xstrdup(name);
 								}
 								/* Returns a name for Generic Netlink family 'id': "control" for
 								 * GENL_ID_CTRL, the name recorded by define_genl_family() for any other
 								 * known id, or "unknown" if the id has not been defined.
 								 *
 								 * The caller must not free the returned string. */
 								static const char *
 								genl_family_to_name(uint16_t id)
 								{
 								    struct genl_family *known;

 								    if (id == GENL_ID_CTRL) {
 								        return "control";
 								    }

 								    known = find_genl_family_by_id(id);
 								    if (!known) {
 								        return "unknown";
 								    }
 								    return known->name;
 								}
-												netlink-socket.c: add support for do_lookup_genl_family on Windows

In this patch, we add support for querying the genl family id for any
family supported by the OVS kernel datapath. On platforms that support
netlink natively, the operating system assigns a family ID, and the
OS netlink infrastructure supports querying the family ID by name.

In case of Windows, since OVS datapath provides the netlink support,
it is not necessary to make a call into the kernel. Returning a
family ID that is consistent between the userspace and kernel
is sufficient. Once there is code to support netlink message parsing
as well as constructing netlink messages, we can make a call into
the kernel, but that in itself may not buy anything more than this
approach.

This patch is a precursor to make progress of the other commands.
The next hurdle is to support nl_lookup_genl_mcgroup().

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 01:16:44 -07:00
+								#ifndef _WIN32
-												netlink-socket: New function nl_lookup_genl_mcgroup().

											
										
										
											2011-08-23 13:13:34 -07:00
+								static int
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								do_lookup_genl_family(const char *name, struct nlattr **attrs,
 								                      struct ofpbuf **replyp)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
 								    struct nl_sock *sock;
 								    struct ofpbuf request, *reply;
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								    int error;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								    *replyp = NULL;
 								    error = nl_sock_create(NETLINK_GENERIC, &sock);
 								    if (error) {
 								        return error;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
 								    ofpbuf_init(&request, 0);
 								    nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
 								                          CTRL_CMD_GETFAMILY, 1);
 								    nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name);
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								    error = nl_sock_transact(sock, &request, &reply);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    ofpbuf_uninit(&request);
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								    if (error) {
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        nl_sock_destroy(sock);
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								        return error;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
 								    if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								                         family_policy, attrs, ARRAY_SIZE(family_policy))
 								        || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        nl_sock_destroy(sock);
 								        ofpbuf_delete(reply);
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								        return EPROTO;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
 								    nl_sock_destroy(sock);
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								    *replyp = reply;
 								    return 0;
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								}
-												netlink-socket.c: add support for do_lookup_genl_family on Windows

In this patch, we add support for querying the genl family id for any
family supported by the OVS kernel datapath. On platforms that support
netlink natively, the operating system assigns a family ID, and the
OS netlink infrastructure supports querying the family ID by name.

In case of Windows, since OVS datapath provides the netlink support,
it is not necessary to make a call into the kernel. Returning a
family ID that is consistent between the userspace and kernel
is sufficient. Once there is code to support netlink message parsing
as well as constructing netlink messages, we can make a call into
the kernel, but that in itself may not buy anything more than this
approach.

This patch is a precursor to making progress on the other commands.
The next hurdle is to support nl_lookup_genl_mcgroup().

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 01:16:44 -07:00
+								#else
 								static int
 								do_lookup_genl_family(const char *name, struct nlattr **attrs,
 								                      struct ofpbuf **replyp)
 								{
 								    struct nl_sock *sock;
-												netlink-socket: add support for nl_lookup_genl_mcgroup()

While we work out whether nl_sock_join_mcgroup() will be the mechanism
to support VPORT events, it is easy to add support for
nl_lookup_genl_mcgroup() and make progress on the other commands.

In this patch, we implement support for nl_lookup_genl_mcgroup() only
for the VPORT family though, which is all what dpif-linux.c needs.

Validation:
- A ported dpif-linux.c with epoll code commented out went so far as
to call dp_enumerate! DP Dump commands can be implemented next.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:17:03 -07:00
+								    struct nlmsghdr *nlmsg;
-												netlink-socket.c: add support for do_lookup_genl_family on Windows

In this patch, we add support for querying the genl family id for any
family supported by the OVS kernel datapath. On platforms that support
netlink natively, the operating system assigns a family ID, and the
OS netlink infrastructure supports querying the family ID by name.

In case of Windows, since OVS datpath provides the netlink support,
it is not necessary to make a call into the kernel. Returning a
family ID that is consistent between the userspace and kernel
is sufficient. Once there is code to support netlink message parsing
as well as constructing netlink messages, we can make a call into
the kernel, but that in itself may not buy anything more than this
approach.

This patch is a precursor to make progress of the other commands.
The next hurdle is to support nl_lookup_genl_mcgroup().

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 01:16:44 -07:00
+								    struct ofpbuf *reply;
 								    int error;
 								    uint16_t family_id;
 								    const char *family_name;
 								    uint32_t family_version;
 								    uint32_t family_attrmax;
-												netlink-socket: add support for nl_lookup_genl_mcgroup()

While we work out whether nl_sock_join_mcgroup() will be the mechanism
to support VPORT events, it is easy to add support for
nl_lookup_genl_mcgroup() and make progress on the other commands.

In this patch, we implement support for nl_lookup_genl_mcgroup() only
for the VPORT family though, which is all what dpif-linux.c needs.

Validation:
- A ported dpif-linux.c with epoll code commented out went so far as
to call dp_enumerate! DP Dump commands can be implemented next.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:17:03 -07:00
+								    uint32_t mcgrp_id = OVS_WIN_NL_INVALID_MCGRP_ID;
 								    const char *mcgrp_name = NULL;
-												netlink-socket.c: add support for do_lookup_genl_family on Windows

In this patch, we add support for querying the genl family id for any
family supported by the OVS kernel datapath. On platforms that support
netlink natively, the operating system assigns a family ID, and the
OS netlink infrastructure supports querying the family ID by name.

In case of Windows, since OVS datpath provides the netlink support,
it is not necessary to make a call into the kernel. Returning a
family ID that is consistent between the userspace and kernel
is sufficient. Once there is code to support netlink message parsing
as well as constructing netlink messages, we can make a call into
the kernel, but that in itself may not buy anything more than this
approach.

This patch is a precursor to make progress of the other commands.
The next hurdle is to support nl_lookup_genl_mcgroup().

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 01:16:44 -07:00
 								    *replyp = NULL;
 								    reply = ofpbuf_new(1024);
-												netlink-socket: add support for nl_lookup_genl_mcgroup()

While we work out whether nl_sock_join_mcgroup() will be the mechanism
to support VPORT events, it is easy to add support for
nl_lookup_genl_mcgroup() and make progress on the other commands.

In this patch, we implement support for nl_lookup_genl_mcgroup() only
for the VPORT family though, which is all what dpif-linux.c needs.

Validation:
- A ported dpif-linux.c with epoll code commented out went so far as
to call dp_enumerate! DP Dump commands can be implemented next.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:17:03 -07:00
+								    /* CTRL_ATTR_MCAST_GROUPS is supported only for VPORT family. */
-												netlink-socket.c: add support for do_lookup_genl_family on Windows

In this patch, we add support for querying the genl family id for any
family supported by the OVS kernel datapath. On platforms that support
netlink natively, the operating system assigns a family ID, and the
OS netlink infrastructure supports querying the family ID by name.

In case of Windows, since OVS datpath provides the netlink support,
it is not necessary to make a call into the kernel. Returning a
family ID that is consistent between the userspace and kernel
is sufficient. Once there is code to support netlink message parsing
as well as constructing netlink messages, we can make a call into
the kernel, but that in itself may not buy anything more than this
approach.

This patch is a precursor to make progress of the other commands.
The next hurdle is to support nl_lookup_genl_mcgroup().

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 01:16:44 -07:00
+								    if (!strcmp(name, OVS_WIN_CONTROL_FAMILY)) {
 								        family_id = OVS_WIN_NL_CTRL_FAMILY_ID;
 								        family_name = OVS_WIN_CONTROL_FAMILY;
 								        family_version = OVS_WIN_CONTROL_VERSION;
 								        family_attrmax = OVS_WIN_CONTROL_ATTR_MAX;
 								    } else if (!strcmp(name, OVS_DATAPATH_FAMILY)) {
 								        family_id = OVS_WIN_NL_DATAPATH_FAMILY_ID;
 								        family_name = OVS_DATAPATH_FAMILY;
 								        family_version = OVS_DATAPATH_VERSION;
 								        family_attrmax = OVS_DP_ATTR_MAX;
 								    } else if (!strcmp(name, OVS_PACKET_FAMILY)) {
 								        family_id = OVS_WIN_NL_PACKET_FAMILY_ID;
 								        family_name = OVS_PACKET_FAMILY;
 								        family_version = OVS_PACKET_VERSION;
 								        family_attrmax = OVS_PACKET_ATTR_MAX;
 								    } else if (!strcmp(name, OVS_VPORT_FAMILY)) {
 								        family_id = OVS_WIN_NL_VPORT_FAMILY_ID;
 								        family_name = OVS_VPORT_FAMILY;
 								        family_version = OVS_VPORT_VERSION;
 								        family_attrmax = OVS_VPORT_ATTR_MAX;
-												netlink-socket: add support for nl_lookup_genl_mcgroup()

While we work out whether nl_sock_join_mcgroup() will be the mechanism
to support VPORT events, it is easy to add support for
nl_lookup_genl_mcgroup() and make progress on the other commands.

In this patch, we implement support for nl_lookup_genl_mcgroup() only
for the VPORT family though, which is all what dpif-linux.c needs.

Validation:
- A ported dpif-linux.c with epoll code commented out went so far as
to call dp_enumerate! DP Dump commands can be implemented next.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:17:03 -07:00
+								        mcgrp_id = OVS_WIN_NL_VPORT_MCGRP_ID;
 								        mcgrp_name = OVS_VPORT_MCGROUP;
-												netlink-socket.c: add support for do_lookup_genl_family on Windows

In this patch, we add support for querying the genl family id for any
family supported by the OVS kernel datapath. On platforms that support
netlink natively, the operating system assigns a family ID, and the
OS netlink infrastructure supports querying the family ID by name.

In case of Windows, since OVS datpath provides the netlink support,
it is not necessary to make a call into the kernel. Returning a
family ID that is consistent between the userspace and kernel
is sufficient. Once there is code to support netlink message parsing
as well as constructing netlink messages, we can make a call into
the kernel, but that in itself may not buy anything more than this
approach.

This patch is a precursor to make progress of the other commands.
The next hurdle is to support nl_lookup_genl_mcgroup().

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 01:16:44 -07:00
+								    } else if (!strcmp(name, OVS_FLOW_FAMILY)) {
 								        family_id = OVS_WIN_NL_FLOW_FAMILY_ID;
 								        family_name = OVS_FLOW_FAMILY;
 								        family_version = OVS_FLOW_VERSION;
 								        family_attrmax = OVS_FLOW_ATTR_MAX;
 								    } else {
 								        ofpbuf_delete(reply);
 								        return EINVAL;
 								    }
 								    nl_msg_put_genlmsghdr(reply, 0, GENL_ID_CTRL, 0,
 								                          CTRL_CMD_NEWFAMILY, family_version);
 								    /* CTRL_ATTR_HDRSIZE and CTRL_ATTR_OPS are not populated, but the
 								     * callers do not seem to need them. */
 								    nl_msg_put_u16(reply, CTRL_ATTR_FAMILY_ID, family_id);
 								    nl_msg_put_string(reply, CTRL_ATTR_FAMILY_NAME, family_name);
 								    nl_msg_put_u32(reply, CTRL_ATTR_VERSION, family_version);
 								    nl_msg_put_u32(reply, CTRL_ATTR_MAXATTR, family_attrmax);
-												netlink-socket: add support for nl_lookup_genl_mcgroup()

While we work out whether nl_sock_join_mcgroup() will be the mechanism
to support VPORT events, it is easy to add support for
nl_lookup_genl_mcgroup() and make progress on the other commands.

In this patch, we implement support for nl_lookup_genl_mcgroup() only
for the VPORT family though, which is all what dpif-linux.c needs.

Validation:
- A ported dpif-linux.c with epoll code commented out went so far as
to call dp_enumerate! DP Dump commands can be implemented next.

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-26 20:17:03 -07:00
+								    if (mcgrp_id != OVS_WIN_NL_INVALID_MCGRP_ID) {
 								        size_t mcgrp_ofs1 = nl_msg_start_nested(reply, CTRL_ATTR_MCAST_GROUPS);
 								        size_t mcgrp_ofs2= nl_msg_start_nested(reply,
 								            OVS_WIN_NL_VPORT_MCGRP_ID - OVS_WIN_NL_MCGRP_START_ID);
 								        nl_msg_put_u32(reply, CTRL_ATTR_MCAST_GRP_ID, mcgrp_id);
 								        ovs_assert(mcgrp_name != NULL);
 								        nl_msg_put_string(reply, CTRL_ATTR_MCAST_GRP_NAME, mcgrp_name);
 								        nl_msg_end_nested(reply, mcgrp_ofs2);
 								        nl_msg_end_nested(reply, mcgrp_ofs1);
 								    }
 								    /* Set the total length of the netlink message. */
 								    nlmsg = nl_msg_nlmsghdr(reply);
 								    nlmsg->nlmsg_len = ofpbuf_size(reply);
-												netlink-socket.c: add support for do_lookup_genl_family on Windows

In this patch, we add support for querying the genl family id for any
family supported by the OVS kernel datapath. On platforms that support
netlink natively, the operating system assigns a family ID, and the
OS netlink infrastructure supports querying the family ID by name.

In case of Windows, since OVS datpath provides the netlink support,
it is not necessary to make a call into the kernel. Returning a
family ID that is consistent between the userspace and kernel
is sufficient. Once there is code to support netlink message parsing
as well as constructing netlink messages, we can make a call into
the kernel, but that in itself may not buy anything more than this
approach.

This patch is a precursor to make progress of the other commands.
The next hurdle is to support nl_lookup_genl_mcgroup().

Signed-off-by: Nithin Raju <nithin@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 01:16:44 -07:00
+								    if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
 								                         family_policy, attrs, ARRAY_SIZE(family_policy))
 								        || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
 								        nl_sock_destroy(sock);
 								        ofpbuf_delete(reply);
 								        return EPROTO;
 								    }
 								    *replyp = reply;
 								    return 0;
 								}
 								#endif
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
-												netlink-socket: New function nl_lookup_genl_mcgroup().

											
										
										
											2011-08-23 13:13:34 -07:00
+								/* Finds the multicast group called 'group_name' in genl family 'family_name'.
 								 * When successful, writes its result to 'multicast_group' and returns 0.
-												dpif-linux: Handle nl_lookup_genl_mcgroup() failures.

The nl_lookup_genl_mcgroup() function can fail on older kernels
which do not support the required netlink interface.  Before this
patch, dpif-linux would refuse to create a datapath when this
happened.  With this patch, it attempts to use a workaround.  If
the workaround fails it simply disables the affected features
without completely disabling the dpif.

											
										
										
											2011-09-12 18:57:50 -07:00
+								 * Otherwise, clears 'multicast_group' and returns a positive error code.
-												datapath: Cleanup netlink compat code.

Patch removes genl, netlink, rtnl compat code and dpif-linux
fallback-id compat code.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2013-08-26 23:53:17 -07:00
+								 */
-												netlink-socket: New function nl_lookup_genl_mcgroup().

											
										
										
											2011-08-23 13:13:34 -07:00
+								int
 								nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
 								                       unsigned int *multicast_group)
 								{
 								    /* Shape required of each entry nested in CTRL_ATTR_MCAST_GROUPS. */
 								    static const struct nl_policy group_policy[] = {
 								        [CTRL_ATTR_MCAST_GRP_ID] = {.type = NL_A_U32},
 								        [CTRL_ATTR_MCAST_GRP_NAME] = {.type = NL_A_STRING},
 								    };

 								    struct nlattr *family_attrs[ARRAY_SIZE(family_policy)];
 								    const struct nlattr *groups;
 								    const struct nlattr *group;
 								    unsigned int remaining;
 								    struct ofpbuf *reply;
 								    int retval;

 								    *multicast_group = 0;

 								    retval = do_lookup_genl_family(family_name, family_attrs, &reply);
 								    if (retval) {
 								        return retval;
 								    }

 								    /* From here on the answer is EPROTO unless a group named 'group_name'
 								     * is found: a missing group list, a malformed entry, and "no such
 								     * group" all report the same error. */
 								    retval = EPROTO;

 								    groups = family_attrs[CTRL_ATTR_MCAST_GROUPS];
 								    if (groups) {
 								        NL_NESTED_FOR_EACH (group, remaining, groups) {
 								            struct nlattr *group_attrs[ARRAY_SIZE(group_policy)];

 								            if (!nl_parse_nested(group, group_policy, group_attrs,
 								                                 ARRAY_SIZE(group_policy))) {
 								                goto done;
 								            }

 								            if (!strcmp(group_name, nl_attr_get_string(
 								                            group_attrs[CTRL_ATTR_MCAST_GRP_NAME]))) {
 								                *multicast_group =
 								                    nl_attr_get_u32(group_attrs[CTRL_ATTR_MCAST_GRP_ID]);
 								                retval = 0;
 								                goto done;
 								            }
 								        }
 								    }

 								done:
 								    /* 'family_attrs' points into 'reply', so free it only now that all
 								     * attribute reads are finished. */
 								    ofpbuf_delete(reply);
 								    return retval;
 								}
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								/* If '*number' is 0, translates the given Generic Netlink family 'name' to a
 								 * number and stores it in '*number'.  If successful, returns 0 and the caller
 								 * may use '*number' as the family number.  On failure, returns a positive
 								 * errno value and '*number' caches the errno value. */
 								int
 								nl_lookup_genl_family(const char *name, int *number)
 								{
 								    if (*number == 0) {
-												netlink-socket: Avoid use-after-free in nl_lookup_genl_mcgroup().

Commit e408762f "netlink-socket: New function nl_lookup_genl_mcgroup()"
modified do_lookup_genl_family() to return the Netlink attributes to the
caller, but it still freed the Netlink message itself, which meant that
the attributes pointed into freed memory.  This commit fixes the problem.

This commit is not a minimal fix.  It refactors do_lookup_genl_family(),
changing the return value from "negative errno value or positive genl
family id" to the more common "zero or positive errno value".

Found by valgrind.

											
										
										
											2011-09-09 10:21:49 -07:00
+								        struct nlattr *attrs[ARRAY_SIZE(family_policy)];
 								        struct ofpbuf *reply;
 								        int error;
 								        error = do_lookup_genl_family(name, attrs, &reply);
 								        if (!error) {
 								            *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
 								            define_genl_family(*number, name);
 								        } else {
 								            *number = -error;
 								        }
 								        ofpbuf_delete(reply);
-												Replace most uses of assert by ovs_assert.

This is a straight search-and-replace, except that I also removed #include
<assert.h> from each file where there were no assert calls left.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-06 13:14:55 -08:00
+								        ovs_assert(*number != 0);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    }
 								    return *number > 0 ? 0 : -*number;
 								}
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
 								/* A small stash of Netlink socket pointers.  The file-scope 'pools' array
 								 * holds one of these per slot and nl_pool_alloc() selects a slot by
 								 * protocol number. */
 								struct nl_pool {
 								    struct nl_sock *socks[16];  /* Room for up to 16 socket pointers. */
 								    int n;                      /* NOTE(review): presumably the number of
 								                                 * 'socks' entries in use -- confirm. */
 								};
-												Use "error-checking" mutexes in place of other kinds wherever possible.

We've seen a number of deadlocks in the tree since thread safety was
introduced.  So far, all of these are self-deadlocks, that is, a single
thread acquiring a lock and then attempting to re-acquire the same lock
recursively.  When this has happened, the process simply hung, and it was
somewhat difficult to find the cause.

POSIX "error-checking" mutexes check for this specific problem (and
others).  This commit switches from other types of mutexes to
error-checking mutexes everywhere that we can, that is, everywhere that
we're not using recursive mutexes.  This ought to help find problems more
quickly in the future.

There might be performance advantages to other kinds of mutexes in some
cases.  However, the existing mutex type choices were just guesses, so I'd
rather go for easy detection of errors until we know that other mutex
types actually perform better in specific cases.  Also, I did a quick
microbenchmark of glibc mutex types on my host and found that the
error checking mutexes weren't any slower than the other types, at least
when the mutex is uncontended.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-08-20 13:40:02 -07:00
+								static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER; /* Guards 'pools'. */
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex); /* Indexed by Netlink protocol number (see nl_pool_alloc()). */
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
 								/* Obtains a Netlink socket for 'protocol' and stores it in '*sockp'.
 								 *
 								 * If the pool for 'protocol' currently holds at least one socket, the one
 								 * stored last is removed from the pool (while holding 'pool_mutex') and
 								 * handed to the caller, and 0 is returned.  Otherwise the return value and
 								 * '*sockp' are whatever nl_sock_create() produces.
 								 *
 								 * 'protocol' must be a valid index into 'pools'; this is asserted. */
 								static int
 								nl_pool_alloc(int protocol, struct nl_sock **sockp)
 								{
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
+								    struct nl_sock *sock = NULL;
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								    struct nl_pool *pool;
 								    ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								    ovs_mutex_lock(&pool_mutex);
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								    pool = &pools[protocol];
 								    /* Take the socket that was stored last, if the pool has any. */
 								    if (pool->n > 0) {
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
+								        sock = pool->socks[--pool->n];
 								    }
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								    ovs_mutex_unlock(&pool_mutex);
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
 								    /* Hand out the pooled socket if one was found.  Otherwise create a
 								     * new socket; 'pool_mutex' has already been released at this point. */
 								    if (sock) {
 								        *sockp = sock;
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								        return 0;
 								    } else {
 								        return nl_sock_create(protocol, sockp);
 								    }
 								}
 								/* Gives 'sock' back to the pool that matches its 'protocol' member.
 								 *
 								 * Does nothing if 'sock' is null.  If the pool still has a free slot, the
 								 * socket is stored there (while holding 'pool_mutex') for a later
 								 * nl_pool_alloc(); if the pool is full, the socket is passed to
 								 * nl_sock_destroy() instead. */
 								static void
 								nl_pool_release(struct nl_sock *sock)
 								{
 								    if (sock) {
 								        struct nl_pool *pool = &pools[sock->protocol];
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								        ovs_mutex_lock(&pool_mutex);
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								        if (pool->n < ARRAY_SIZE(pool->socks)) {
 								            pool->socks[pool->n++] = sock;
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
+								            sock = NULL;
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								        }
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								        ovs_mutex_unlock(&pool_mutex);
-												netlink-socket: Make thread-safe.

The uses of vlog in this module are not thread-safe, because vlog itself
is not yet thread-safe.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 11:39:11 -07:00
 								        /* 'sock' is null here when the pool accepted it, and still points
 								         * to the socket when the pool was full.  The call is made after
 								         * 'pool_mutex' has been released.
 								         * NOTE(review): relies on nl_sock_destroy() tolerating a null
 								         * argument -- confirm against its definition. */
 								        nl_sock_destroy(sock);
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								    }
 								}
-												netlink-socket: Add conceptual documentation.

Based on a conversation with the VMware Hyper-V team earlier today.

This commit also changes a couple of functions that were only used with
netlink-socket.c into static functions.  I couldn't think of a reason for
code outside that file to use them.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 08:59:40 -07:00
+								/* Sends 'request' to the kernel on a Netlink socket for the given 'protocol'
 								 * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
 								 * successful, returns 0.  On failure, returns a positive errno value.
 								 *
 								 * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
 								 * reply, which the caller is responsible for freeing with ofpbuf_delete(), and
 								 * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
 								 * reply, if any, is discarded.
 								 *
 								 * Before the message is sent, nlmsg_len in 'request' will be finalized to
 								 * match ofpbuf_size(request), nlmsg_pid will be set to the pid of the socket
 								 * used for sending the request, and nlmsg_seq will be initialized.
 								 *
 								 * The caller is responsible for destroying 'request'.
 								 *
 								 * Bare Netlink is an unreliable transport protocol.  This function layers
 								 * reliable delivery and reply semantics on top of bare Netlink.
 								 *
 								 * In Netlink, sending a request to the kernel is reliable enough, because the
 								 * kernel will tell us if the message cannot be queued (and we will in that
 								 * case put it on the transmit queue and wait until it can be delivered).
 								 *
 								 * Receiving the reply is the real problem: if the socket buffer is full when
 								 * the kernel tries to send the reply, the reply will be dropped.  However, the
 								 * kernel sets a flag that a reply has been dropped.  The next call to recv
 								 * then returns ENOBUFS.  We can then re-send the request.
 								 *
 								 * Caveats:
 								 *
 								 *      1. Netlink depends on sequence numbers to match up requests and
 								 *         replies.  The sender of a request supplies a sequence number, and
 								 *         the reply echoes back that sequence number.
 								 *
 								 *         This is fine, but (1) some kernel netlink implementations are
 								 *         broken, in that they fail to echo sequence numbers and (2) this
 								 *         function will drop packets with non-matching sequence numbers, so
 								 *         that only a single request can be usefully transacted at a time.
 								 *
 								 *      2. Resending the request causes it to be re-executed, so the request
 								 *         needs to be idempotent.
 								 */
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								int
 								nl_transact(int protocol, const struct ofpbuf *request,
 								            struct ofpbuf **replyp)
 								{
 								    struct nl_sock *sock;
 								    int error;
 								    /* Borrow a socket for 'protocol'; it goes back to the pool once the
 								     * transaction has completed. */
 								    error = nl_pool_alloc(protocol, &sock);
 								    if (error) {
 								        /* 'replyp' is optional (callers may pass null to discard the
 								         * reply), so it must be checked before being cleared on failure. */
 								        if (replyp) {
 								            *replyp = NULL;
 								        }
 								        return error;
 								    }
 								    /* On this path nl_sock_transact() receives 'replyp' as is, including
 								     * when it is null. */
 								    error = nl_sock_transact(sock, request, replyp);
 								    nl_pool_release(sock);
 								    return error;
 								}
-												netlink-socket: Add conceptual documentation.

Based on a conversation with the VMware Hyper-V team earlier today.

This commit also changes a couple of functions that were only used with
netlink-socket.c into static functions.  I couldn't think of a reason for
code outside that file to use them.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-29 08:59:40 -07:00
+								/* Sends the 'request' member of the 'n' transactions in 'transactions' on a
 								 * Netlink socket for the given 'protocol' (e.g. NETLINK_ROUTE or
 								 * NETLINK_GENERIC), in order, and receives responses to all of them.  Fills in
 								 * the 'error' member of each transaction with 0 if it was successful,
 								 * otherwise with a positive errno value.  If 'reply' is nonnull, then it will
 								 * be filled with the reply if the message receives a detailed reply.  In other
 								 * cases, i.e. where the request failed or had no reply beyond an indication of
 								 * success, 'reply' will be cleared if it is nonnull.
 								 *
 								 * The caller is responsible for destroying each request and reply, and the
 								 * transactions array itself.
 								 *
 								 * Before sending each message, this function will finalize nlmsg_len in each
 								 * 'request' to match the ofpbuf's size, set nlmsg_pid to the pid of the socket
 								 * used for the transaction, and initialize nlmsg_seq.
 								 *
 								 * Bare Netlink is an unreliable transport protocol.  This function layers
 								 * reliable delivery and reply semantics on top of bare Netlink.  See
 								 * nl_transact() for some caveats.
 								 */
-												netlink-socket: Simplify use of transactions and dumps.

This disentangles "struct nl_dump" from "struct nl_sock", clearing the way
to make the use of either one thread-safe in an obviously correct manner.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-29 13:57:50 -07:00
+								void
 								nl_transact_multiple(int protocol,
 								                     struct nl_transaction **transactions, size_t n)
 								{
 								    struct nl_sock *sock;
 								    int error = nl_pool_alloc(protocol, &sock);
 								    /* Without a socket nothing can be sent: mark every transaction with the
 								     * allocation error and stop. */
 								    if (error) {
 								        nl_sock_record_errors__(transactions, n, error);
 								        return;
 								    }
 								    /* Run the whole batch on the borrowed socket, then give the socket back
 								     * to the pool. */
 								    nl_sock_transact_multiple(sock, transactions, n);
 								    nl_pool_release(sock);
 								}
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
-												netlink: Postpone choosing sequence numbers until send time.

Choosing sequence numbers at time of creating a packet means that
nl_sock_transact_multiple() has to search for the sequence number
of a reply, because the sequence numbers of the requests aren't
necessarily sequential.  This commit makes it possible to avoid
the search, by deferring choice of sequence numbers until the
time that we send the packets.  It doesn't actually modify
nl_sock_transact_multiple(), which will happen in a later commit.

Previously, I was concerned about a theoretical race condition
described in a comment in the old version of this code:

    This implementation uses sequence numbers that are unique
    process-wide, to avoid a hypothetical race: send request, close
    socket, open new socket that reuses the old socket's PID value,
    send request on new socket, receive reply from kernel to old
    socket but with same PID and sequence number.  (This race could be
    avoided other ways, e.g. by preventing PIDs from being quickly
    reused).

However, I no longer believe that this can be a real problem,
because Netlink operates synchronously.  The reply to a request
will always arrive before the socket can be closed and a new
socket opened with the old socket's PID.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-16 16:01:01 -07:00
+								static uint32_t
 								nl_sock_allocate_seq(struct nl_sock *sock, unsigned int n)
 								{
 								    /* The caller gets the run of 'n' sequence numbers that begins at the
 								     * current value of the socket's counter. */
 								    const uint32_t first = sock->next_seq;
 								    sock->next_seq += n;
 								    if (sock->next_seq < UINT32_MAX / 2) {
 								        return first;
 								    }
 								    /* The counter has reached the upper half of its range.  Restart it at 1
 								     * so that the next request for sequence numbers cannot wrap around to
 								     * 0, which the kernel uses as the sequence number of notifications. */
 								    sock->next_seq = 1;
 								    return first;
 								}
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								static void
 								/* nlmsghdr_to_string(): appends a human-readable summary of Netlink
 								 * message header 'h' to 'ds'.
 								 *
 								 * The summary lists the length and numeric type, a parenthesized label for
 								 * the type, the flags in hexadecimal followed by the bracketed names of the
 								 * flags recognized from the local table (a flag is named only when all of
 								 * its bits are set; any leftover bits are shown as "OTHER"), and finally
 								 * the sequence number (hexadecimal) and the pid.
 								 *
 								 * The type label is a fixed name for NLMSG_NOOP, NLMSG_ERROR, NLMSG_DONE
 								 * and NLMSG_OVERRUN, "(reserved)" for other types below NLMSG_MIN_TYPE, the
 								 * result of genl_family_to_name() when 'protocol' is NETLINK_GENERIC, and
 								 * "(family-defined)" otherwise.
 								 *
 								 * The text starts with "nl(" and this function does not append a matching
 								 * closing parenthesis. */
-												netlink-socket: Log Generic Netlink family names.

The ids for Generic Netlink family names aren't very helpful because they
can vary from machine to machine and even from one boot to the next.  So
this change logs their names too.

This only affects logging at DBG level.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-11 11:02:32 -08:00
+								nlmsghdr_to_string(const struct nlmsghdr *h, int protocol, struct ds *ds)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
 								    struct nlmsg_flag {
 								        unsigned int bits;
 								        const char *name;
 								    };
 								    static const struct nlmsg_flag flags[] = {
 								        { NLM_F_REQUEST, "REQUEST" },
 								        { NLM_F_MULTI, "MULTI" },
 								        { NLM_F_ACK, "ACK" },
 								        { NLM_F_ECHO, "ECHO" },
 								        { NLM_F_DUMP, "DUMP" },
 								        { NLM_F_ROOT, "ROOT" },
 								        { NLM_F_MATCH, "MATCH" },
 								        { NLM_F_ATOMIC, "ATOMIC" },
 								    };
 								    const struct nlmsg_flag *flag;
 								    uint16_t flags_left;
 								    ds_put_format(ds, "nl(len:%"PRIu32", type=%"PRIu16,
 								                  h->nlmsg_len, h->nlmsg_type);
 								    if (h->nlmsg_type == NLMSG_NOOP) {
 								        ds_put_cstr(ds, "(no-op)");
 								    } else if (h->nlmsg_type == NLMSG_ERROR) {
 								        ds_put_cstr(ds, "(error)");
 								    } else if (h->nlmsg_type == NLMSG_DONE) {
 								        ds_put_cstr(ds, "(done)");
 								    } else if (h->nlmsg_type == NLMSG_OVERRUN) {
 								        ds_put_cstr(ds, "(overrun)");
 								    } else if (h->nlmsg_type < NLMSG_MIN_TYPE) {
 								        ds_put_cstr(ds, "(reserved)");
-												netlink-socket: Log Generic Netlink family names.

The ids for Generic Netlink family names aren't very helpful because they
can vary from machine to machine and even from one boot to the next.  So
this change logs their names too.

This only affects logging at DBG level.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-11 11:02:32 -08:00
+								    } else if (protocol == NETLINK_GENERIC) {
 								        ds_put_format(ds, "(%s)", genl_family_to_name(h->nlmsg_type));
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    } else {
 								        ds_put_cstr(ds, "(family-defined)");
 								    }
 								    ds_put_format(ds, ", flags=%"PRIx16, h->nlmsg_flags);
 								    flags_left = h->nlmsg_flags;
 								    for (flag = flags; flag < &flags[ARRAY_SIZE(flags)]; flag++) {
 								        if ((flags_left & flag->bits) == flag->bits) {
 								            ds_put_format(ds, "[%s]", flag->name);
 								            flags_left &= ~flag->bits;
 								        }
 								    }
 								    if (flags_left) {
 								        ds_put_format(ds, "[OTHER:%"PRIx16"]", flags_left);
 								    }
-												netlink-socket: Let the kernel choose Netlink pids for us.

The Netlink code in the Linux kernel has been willing to choose unique
Netlink pids for userspace sockets since at least 2.4.36 and probably
earlier.  There's no value in choosing them ourselves.

This simplifies the code and eliminates the possibility of exhausting our
supply of Netlink PIDs.

											
										
										
											2011-11-14 10:10:58 -08:00
+								    ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32,
 								                  h->nlmsg_seq, h->nlmsg_pid);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								}
 								/* Returns a string that describes the Netlink message in 'buffer',
 								 * interpreted according to 'protocol'.
 								 *
 								 * If 'buffer' is too short to contain a Netlink header (ofpbuf_at() yields
 								 * null), the result is "nl(truncated)".  Otherwise the header is formatted
 								 * by nlmsghdr_to_string() and, depending on the message type, followed by:
 								 *
 								 *      - NLMSG_ERROR: the error code, its ovs_strerror() text when the code
 								 *        is negative, and the header of the message the error refers to;
 								 *        or " error(truncated)" if the nlmsgerr payload is incomplete.
 								 *
 								 *      - NLMSG_DONE: the int that follows the header, with its
 								 *        ovs_strerror() text when negative; or " done(truncated)" if that
 								 *        int is missing.
 								 *
 								 *      - Any other type when 'protocol' is NETLINK_GENERIC: the Generic
 								 *        Netlink command and version, if nl_msg_genlmsghdr() finds a
 								 *        header.
 								 *
 								 * The return value is the 'string' member of a local struct ds;
 								 * log_nlmsg() releases it with free(). */
 								static char *
-												netlink-socket: Slightly improve logging of Generic Netlink messages.

This makes the stream of requests and replies very slightly easier to
understand.

Reviewed by Justin Pettit.

											
										
										
											2011-01-18 14:07:52 -08:00
+								nlmsg_to_string(const struct ofpbuf *buffer, int protocol)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
 								    struct ds ds = DS_EMPTY_INITIALIZER;
 								    const struct nlmsghdr *h = ofpbuf_at(buffer, 0, NLMSG_HDRLEN);
 								    if (h) {
-												netlink-socket: Log Generic Netlink family names.

The ids for Generic Netlink family names aren't very helpful because they
can vary from machine to machine and even from one boot to the next.  So
this change logs their names too.

This only affects logging at DBG level.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-11 11:02:32 -08:00
+								        nlmsghdr_to_string(h, protocol, &ds);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        if (h->nlmsg_type == NLMSG_ERROR) {
 								            const struct nlmsgerr *e;
 								            e = ofpbuf_at(buffer, NLMSG_HDRLEN,
 								                          NLMSG_ALIGN(sizeof(struct nlmsgerr)));
 								            if (e) {
 								                ds_put_format(&ds, " error(%d", e->error);
 								                if (e->error < 0) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                    ds_put_format(&ds, "(%s)", ovs_strerror(-e->error));
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								                }
 								                ds_put_cstr(&ds, ", in-reply-to(");
-												netlink-socket: Log Generic Netlink family names.

The ids for Generic Netlink family names aren't very helpful because they
can vary from machine to machine and even from one boot to the next.  So
this change logs their names too.

This only affects logging at DBG level.

Reviewed by Ethan Jackson <ethan@nicira.com>.

											
										
										
											2011-01-11 11:02:32 -08:00
+								                nlmsghdr_to_string(&e->msg, protocol, &ds);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								                ds_put_cstr(&ds, "))");
 								            } else {
 								                ds_put_cstr(&ds, " error(truncated)");
 								            }
 								        } else if (h->nlmsg_type == NLMSG_DONE) {
 								            int *error = ofpbuf_at(buffer, NLMSG_HDRLEN, sizeof *error);
 								            if (error) {
 								                ds_put_format(&ds, " done(%d", *error);
 								                if (*error < 0) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                    ds_put_format(&ds, "(%s)", ovs_strerror(-*error));
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								                }
 								                ds_put_cstr(&ds, ")");
 								            } else {
 								                ds_put_cstr(&ds, " done(truncated)");
 								            }
-												netlink-socket: Slightly improve logging of Generic Netlink messages.

This makes the stream of requests and replies very slightly easier to
understand.

Reviewed by Justin Pettit.

											
										
										
											2011-01-18 14:07:52 -08:00
+								        } else if (protocol == NETLINK_GENERIC) {
 								            struct genlmsghdr *genl = nl_msg_genlmsghdr(buffer);
 								            if (genl) {
 								                ds_put_format(&ds, ",genl(cmd=%"PRIu8",version=%"PRIu8")",
 								                              genl->cmd, genl->version);
 								            }
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								        }
 								    } else {
 								        ds_put_cstr(&ds, "nl(truncated)");
 								    }
 								    return ds.string;
 								}
 								/* Logs, at debug level and subject to rate limit 'rl', the Netlink message
 								 * made up of the 'size' bytes at 'message'.
 								 *
 								 * The log line consists of 'function', the ovs_strerror() text for 'error',
 								 * and the description produced by nlmsg_to_string() for 'protocol'.
 								 *
 								 * Returns immediately, without formatting anything, when debug logging is
 								 * not enabled. */
 								static void
 								log_nlmsg(const char *function, int error,
-												netlink-socket: Slightly improve logging of Generic Netlink messages.

This makes the stream of requests and replies very slightly easier to
understand.

Reviewed by Justin Pettit.

											
										
										
											2011-01-18 14:07:52 -08:00
+								          const void *message, size_t size, int protocol)
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								{
 								    struct ofpbuf buffer;
 								    char *nlmsg;
 								    /* Skip all of the formatting work unless it will actually be logged. */
 								    if (!VLOG_IS_DBG_ENABLED()) {
 								        return;
 								    }
 								    /* Present the raw bytes as an ofpbuf, which is what nlmsg_to_string()
 								     * takes. */
 								    ofpbuf_use_const(&buffer, message, size);
-												netlink-socket: Slightly improve logging of Generic Netlink messages.

This makes the stream of requests and replies very slightly easier to
understand.

Reviewed by Justin Pettit.

											
										
										
											2011-01-18 14:07:52 -08:00
+								    nlmsg = nlmsg_to_string(&buffer, protocol);
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								    VLOG_DBG_RL(&rl, "%s (%s): %s", function, ovs_strerror(error), nlmsg);
-												netlink: Split into generic and Linux-specific parts.

The parts of the netlink module that are related to sockets are
Linux-specific, since only Linux has AF_NETLINK sockets.  The rest can be
built anywhere.  This commit breaks them into two modules, and builds the
generic one on all platforms.

Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 09:51:03 -08:00
+								    free(nlmsg);
 								}