mirror of
https://github.com/openvswitch/ovs
synced 2025-10-27 15:18:06 +00:00
netlink-socket: Work around kernel Netlink dump thread races.
The Linux kernel Netlink implementation has two races that cause problems
for processes that attempt to dump a table in a multithreaded manner.
The first race is in the structure of the kernel netlink_recv() function.
This function pulls a message from the socket queue and, if there is none,
reports EAGAIN:
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (skb == NULL)
goto out;
Only if a message is successfully read from the socket queue does the
function, toward the end, try to queue up a new message to be dumped:
if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
ret = netlink_dump(sk);
if (ret) {
sk->sk_err = ret;
sk->sk_error_report(sk);
}
}
This means that if thread A reads a message from a dump, then thread B
attempts to read one before A queues up the next, B will get EAGAIN. This
means that, following EAGAIN, B needs to wait until A returns to userspace
before it tries to read the socket again. nl_dump_next() already does
this, using 'dump->status_seq' (although the need for it has never been
explained clearly, to my knowledge).
The second race is more serious. Suppose thread X and thread Y both
simultaneously attempt to queue up a new message to be dumped, using the
call to netlink_dump() quoted above. netlink_dump() begins with:
mutex_lock(nlk->cb_mutex);
cb = nlk->cb;
if (cb == NULL) {
err = -EINVAL;
goto errout_skb;
}
Suppose that X gets cb_mutex first and finds that the dump is complete. It
will therefore, toward the end of netlink_dump(), clear nlk->cb to NULL to
indicate that no dump is in progress and release the mutex:
nlk->cb = NULL;
mutex_unlock(nlk->cb_mutex);
When Y grabs cb_mutex afterward, it will see that nlk->cb is NULL and
return -EINVAL as quoted above. netlink_recv() stuffs -EINVAL in sk_err,
but that error is not reported immediately; instead, it is saved for the
next read from the socket. Since Open vSwitch maintains a pool of Netlink
sockets, that next failure can crop up pretty much anywhere. One of the
worst places for it to crop up is in the execution of a later transaction
(e.g. in nl_sock_transact_multiple__()), because userspace treats Netlink
transactions as idempotent and will re-execute them when socket errors
occur. For a transaction that sends a packet, this causes packet
duplication, which we actually observed in practice. (ENOBUFS should
actually cause transactions to be re-executed in many cases, but EINVAL
should not; this is a separate bug in the userspace netlink code.)
VMware-BZ: #1283188
Reported-and-tested-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>
This commit is contained in:
@@ -714,6 +714,7 @@ nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
|
|||||||
atomic_init(&dump->status, status << 1);
|
atomic_init(&dump->status, status << 1);
|
||||||
dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
|
dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
|
||||||
dump->status_seq = seq_create();
|
dump->status_seq = seq_create();
|
||||||
|
ovs_mutex_init(&dump->mutex);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Attempts to retrieve another reply from 'dump' into 'buffer'. 'dump' must
|
/* Attempts to retrieve another reply from 'dump' into 'buffer'. 'dump' must
|
||||||
@@ -756,7 +757,15 @@ nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Take the mutex here to avoid an in-kernel race. If two threads try
|
||||||
|
* to read from a Netlink dump socket at once, then the socket error
|
||||||
|
* can be set to EINVAL, which will be encountered on the next recv on
|
||||||
|
* that socket, which could be anywhere due to the way that we pool
|
||||||
|
* Netlink sockets. Serializing the recv calls avoids the issue. */
|
||||||
|
ovs_mutex_lock(&dump->mutex);
|
||||||
retval = nl_sock_recv__(dump->sock, buffer, false);
|
retval = nl_sock_recv__(dump->sock, buffer, false);
|
||||||
|
ovs_mutex_unlock(&dump->mutex);
|
||||||
|
|
||||||
if (retval) {
|
if (retval) {
|
||||||
ofpbuf_clear(buffer);
|
ofpbuf_clear(buffer);
|
||||||
if (retval == EAGAIN) {
|
if (retval == EAGAIN) {
|
||||||
@@ -835,6 +844,7 @@ nl_dump_done(struct nl_dump *dump)
|
|||||||
}
|
}
|
||||||
nl_pool_release(dump->sock);
|
nl_pool_release(dump->sock);
|
||||||
seq_destroy(dump->status_seq);
|
seq_destroy(dump->status_seq);
|
||||||
|
ovs_mutex_destroy(&dump->mutex);
|
||||||
return status >> 1;
|
return status >> 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -52,6 +52,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "ofpbuf.h"
|
#include "ofpbuf.h"
|
||||||
#include "ovs-atomic.h"
|
#include "ovs-atomic.h"
|
||||||
|
#include "ovs-thread.h"
|
||||||
|
|
||||||
struct nl_sock;
|
struct nl_sock;
|
||||||
|
|
||||||
@@ -114,6 +115,7 @@ struct nl_dump {
|
|||||||
atomic_uint status; /* Low bit set if we read final message.
|
atomic_uint status; /* Low bit set if we read final message.
|
||||||
* Other bits hold an errno (0 for success). */
|
* Other bits hold an errno (0 for success). */
|
||||||
struct seq *status_seq; /* Tracks changes to the above 'status'. */
|
struct seq *status_seq; /* Tracks changes to the above 'status'. */
|
||||||
|
struct ovs_mutex mutex;
|
||||||
};
|
};
|
||||||
|
|
||||||
void nl_dump_start(struct nl_dump *, int protocol,
|
void nl_dump_start(struct nl_dump *, int protocol,
|
||||||
|
|||||||
Reference in New Issue
Block a user