2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-31 06:15:47 +00:00

netdev-linux: Replace sendmsg with sendmmsg in netdev_linux_send

Sendmmsg can reduce cpu cycles in sending packets to kernel.
Replace sendmsg with sendmmsg in function netdev_linux_send to send
batch packets if sendmmsg is available.

If kernel side doesn't support sendmmsg, will fallback to sendmsg.

    netserver
|------------|
|            |
|  container |
|----veth----|
          |
          |        |------------|
          |---veth-|   dpdk-ovs |      netperf
                   |            |  |--------------|
                   |----dpdk----|  | bare-metal   |
                         |         |--------------|
                         |              |
                         |              |
                        pnic-----------pnic

Netperf was consumed to test the performance:

1)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 1400
result: netserver received 2383.21Mb(sendmsg)/2551.64Mb(sendmmsg)

2)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 60
result: netserver received 109.72Mb(sendmsg)/115.18Mb(sendmmsg)

Sendmmsg show about 6% improvement in netperf UDP testing.

Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
This commit is contained in:
Zhenyu Gao
2017-08-02 14:58:24 -07:00
committed by Ben Pfaff
parent a61a289119
commit d19cf8bb79

View File

@@ -1182,13 +1182,89 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
}
}
/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
static int
netdev_linux_sock_batch_send(int sock, int ifindex,
struct dp_packet_batch *batch)
{
/* We don't bother setting most fields in sockaddr_ll because the
* kernel ignores them for SOCK_RAW. */
struct sockaddr_ll sll = { .sll_family = AF_PACKET,
.sll_ifindex = ifindex };
struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * batch->count);
struct iovec *iov = xmalloc(sizeof(*iov) * batch->count);
for (int i = 0; i < batch->count; i++) {
struct dp_packet *packet = batch->packets[i];
iov[i].iov_base = dp_packet_data(packet);
iov[i].iov_len = dp_packet_get_send_len(packet);
mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
.msg_namelen = sizeof sll,
.msg_iov = &iov[i],
.msg_iovlen = 1 };
}
int error = 0;
for (uint32_t ofs = 0; ofs < batch->count; ) {
ssize_t retval;
do {
retval = sendmmsg(sock, mmsg + ofs, batch->count - ofs, 0);
error = retval < 0 ? errno : 0;
} while (error == EINTR);
if (error) {
break;
}
ofs += retval;
}
free(mmsg);
free(iov);
return error;
}
/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
* essential, because packets sent to a tap device with an AF_PACKET socket
* will loop back to be *received* again on the tap device. This doesn't occur
* on other interface types because we attach a socket filter to the rx
* socket. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_,
struct dp_packet_batch *batch)
{
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
for (int i = 0; i < batch->count; i++) {
struct dp_packet *packet = batch->packets[i];
size_t size = dp_packet_get_send_len(packet);
ssize_t retval;
int error;
do {
retval = write(netdev->tap_fd, dp_packet_data(packet), size);
error = retval < 0 ? errno : 0;
} while (error == EINTR);
if (error) {
/* The Linux tap driver returns EIO if the device is not up. From
* the OVS side this is not an error, so we ignore it; otherwise,
* return the erro. */
if (error != EIO) {
return error;
}
} else if (retval != size) {
VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
"bytes of %"PRIuSIZE") on %s",
retval, size, netdev_get_name(netdev_));
return EMSGSIZE;
}
}
return 0;
}
/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
* errno value. Returns EAGAIN without blocking if the packet cannot be queued
* immediately. Returns EMSGSIZE if a partial packet was transmitted or if
* the packet is too big or too small to transmit on the device.
*
* The caller retains ownership of 'buffer' in all cases.
*
* The kernel maintains a packet transmission queue, so the caller is not
* expected to do additional queuing of packets. */
static int
@@ -1199,8 +1275,6 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
int error = 0;
int sock = 0;
struct sockaddr_ll sll;
struct msghdr msg;
if (!is_tap_netdev(netdev_)) {
sock = af_packet_sock();
if (sock < 0) {
@@ -1214,84 +1288,25 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
goto free_batch;
}
/* We don't bother setting most fields in sockaddr_ll because the
* kernel ignores them for SOCK_RAW. */
memset(&sll, 0, sizeof sll);
sll.sll_family = AF_PACKET;
sll.sll_ifindex = ifindex;
msg.msg_name = &sll;
msg.msg_namelen = sizeof sll;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
error = netdev_linux_sock_batch_send(sock, ifindex, batch);
} else {
error = netdev_linux_tap_batch_send(netdev_, batch);
}
/* 'i' is incremented only if there's no error */
for (int i = 0; i < batch->count; ) {
const void *data = dp_packet_data(batch->packets[i]);
size_t size = dp_packet_get_send_len(batch->packets[i]);
ssize_t retval;
if (!is_tap_netdev(netdev_)) {
/* Use our AF_PACKET socket to send to this device. */
struct iovec iov;
iov.iov_base = CONST_CAST(void *, data);
iov.iov_len = size;
msg.msg_iov = &iov;
retval = sendmsg(sock, &msg, 0);
if (error) {
if (error == ENOBUFS) {
/* The Linux AF_PACKET implementation never blocks waiting
* for room for packets, instead returning ENOBUFS.
* Translate this into EAGAIN for the caller. */
error = EAGAIN;
} else {
/* Use the tap fd to send to this device. This is essential for
* tap devices, because packets sent to a tap device with an
* AF_PACKET socket will loop back to be *received* again on the
* tap device. This doesn't occur on other interface types
* because we attach a socket filter to the rx socket. */
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
retval = write(netdev->tap_fd, data, size);
}
if (retval < 0) {
if (errno == EINTR) {
/* The send was interrupted by a signal. Retry the packet by
* continuing without incrementing 'i'.*/
continue;
} else if (errno == EIO && is_tap_netdev(netdev_)) {
/* The Linux tap driver returns EIO if the device is not up.
* From the OVS side this is not an error, so ignore it. */
} else {
/* The Linux AF_PACKET implementation never blocks waiting for
* room for packets, instead returning ENOBUFS. Translate this
* into EAGAIN for the caller. */
error = errno == ENOBUFS ? EAGAIN : errno;
break;
}
} else if (retval != size) {
VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
" of %"PRIuSIZE") on %s", retval, size,
netdev_get_name(netdev_));
error = EMSGSIZE;
break;
}
/* Process the next packet in the batch */
i++;
}
if (error && error != EAGAIN) {
VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
netdev_get_name(netdev_), ovs_strerror(error));
}
}
free_batch:
dp_packet_delete_batch(batch, may_steal);
return error;
}
/* Registers with the poll loop to wake up from the next call to poll_block()