2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00
criu/soccr/soccr.c
Michał Mirosław 804c0ba820 soccr: Log name of socket queue that failed to restore.
Signed-off-by: Michał Mirosław <emmir@google.com>
2023-10-22 13:29:25 -07:00

931 lines
22 KiB
C

#include <errno.h>
#include <libnet.h>
#include <linux/sockios.h>
#include <linux/types.h>
#include <netinet/tcp.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "soccr.h"
#ifndef SIOCOUTQNSD
/* MAO - Define SIOCOUTQNSD ioctl if we don't have it */
#define SIOCOUTQNSD 0x894B
#endif
/*
 * One bit per TCP connection state. The rest of this file builds
 * "1 << state" from a numeric state and ANDs it with sets of these bits
 * to decide how a connection has to be dumped or restored.
 */
enum {
	TCPF_ESTABLISHED = 1 << 1,
	TCPF_SYN_SENT    = 1 << 2,
	TCPF_SYN_RECV    = 1 << 3,
	TCPF_FIN_WAIT1   = 1 << 4,
	TCPF_FIN_WAIT2   = 1 << 5,
	TCPF_TIME_WAIT   = 1 << 6,
	TCPF_CLOSE       = 1 << 7,
	TCPF_CLOSE_WAIT  = 1 << 8,
	TCPF_LAST_ACK    = 1 << 9,
	TCPF_LISTEN      = 1 << 10,
	TCPF_CLOSING     = 1 << 11,
};
/*
 * The TCP transition diagram for half closed connections
 *
 * ------------
 * FIN_WAIT1    \ FIN
 *                ---------
 *              / ACK   CLOSE_WAIT
 * -----------
 * FIN_WAIT2
 *                ----------
 *              / FIN   LAST_ACK
 * -----------
 * TIME_WAIT    \ ACK
 *                ----------
 *                      CLOSED
 *
 * How to get the TCP_CLOSING state
 *
 * -----------          ----------
 * FIN_WAIT1    \/ FIN  FIN_WAIT1
 * -----------          ----------
 *  CLOSING             CLOSING
 *              \/ ACK
 * -----------          ----------
 *  TIME_WAIT           TIME_WAIT
 */
/*
 * State sets used on restore. Each one is tested against "1 << state".
 */
/* Restore a fin packet in the send queue first, before any fin is restored in the receive queue. */
#define SNDQ_FIRST_FIN (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING)
/* Restore a fin in the send queue after restoring a fin in the receive queue. */
#define SNDQ_SECOND_FIN (TCPF_LAST_ACK | TCPF_CLOSE)
/* The fin in the send queue is restored as an already sent packet and is then acknowledged with a fake ACK. */
#define SNDQ_FIN_ACKED (TCPF_FIN_WAIT2 | TCPF_CLOSE)
/* A fake fin packet has to be sent to the socket to restore a fin in the receive queue. */
#define RCVQ_FIRST_FIN (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSE)
/* Same as RCVQ_FIRST_FIN; the two sets are always tested together in this file. */
#define RCVQ_SECOND_FIN (TCPF_CLOSING)
/* inq_seq is advanced by one at the end of the restore for these states. */
#define RCVQ_FIN_ACKED (TCPF_CLOSE)
/* Log callback installed by the library user; while it is NULL nothing is logged. */
static void (*log)(unsigned int loglevel, const char *format, ...) __attribute__((__format__(__printf__, 2, 3)));
/* Verbosity threshold compared against SOCCR_LOG_* by the log macros. */
static unsigned int log_level = 0;

/*
 * Install the log callback @fn and set the verbosity threshold to @level.
 * Passing a NULL @fn switches logging off.
 */
void libsoccr_set_log(unsigned int level, void (*fn)(unsigned int level, const char *fmt, ...))
{
	log = fn;
	log_level = level;
}
/*
 * loge() - report an error through the installed log callback.
 * Nothing happens unless a callback is set and log_level is at least
 * SOCCR_LOG_ERR. The message is prefixed with "Error (<file>:<line>): ";
 * the caller supplies the trailing newline.
 */
#define loge(msg, ...) \
do { \
if (log && (log_level >= SOCCR_LOG_ERR)) \
log(SOCCR_LOG_ERR, "Error (%s:%d): " msg, __FILE__, __LINE__, ##__VA_ARGS__); \
} while (0)
/*
 * logerr() - loge() plus ": <strerror(errno)>" and a newline appended to
 * the message, so msg itself should not end with a newline.
 */
#define logerr(msg, ...) loge(msg ": %s\n", ##__VA_ARGS__, strerror(errno))
/*
 * logd() - debug message, emitted only when a callback is set and
 * log_level is at least SOCCR_LOG_DBG. Prefixed with "Debug: ".
 */
#define logd(msg, ...) \
do { \
if (log && (log_level >= SOCCR_LOG_DBG)) \
log(SOCCR_LOG_DBG, "Debug: " msg, ##__VA_ARGS__); \
} while (0)
/*
 * Put the socket @fd into TCP repair mode.
 * Returns the setsockopt() result; a failure is logged together with errno.
 */
static int tcp_repair_on(int fd)
{
	int enable = 1;
	int rc = setsockopt(fd, SOL_TCP, TCP_REPAIR, &enable, sizeof(enable));

	if (rc < 0)
		logerr("Can't turn TCP repair mode ON");

	return rc;
}
/*
 * Take the socket @fd out of TCP repair mode.
 * Returns the setsockopt() result; a failure is logged together with errno.
 */
static int tcp_repair_off(int fd)
{
	int disable = 0;
	int rc = setsockopt(fd, SOL_TCP, TCP_REPAIR, &disable, sizeof(disable));

	if (rc < 0)
		logerr("Failed to turn off repair mode on socket");

	return rc;
}
/*
 * Library handle for one socket. It is allocated by libsoccr_pause() and
 * destroyed by libsoccr_release().
 */
struct libsoccr_sk {
/* The socket descriptor the handle was created for. */
int fd;
/* SK_FLAG_FREE_* bits: which of the pointers below libsoccr_release() frees. */
unsigned flags;
/* Contents of the receive queue, or NULL. */
char *recv_queue;
/* Contents of the send queue, or NULL. */
char *send_queue;
/* Local address, set with libsoccr_set_addr(sk, 1, ...). */
union libsoccr_addr *src_addr;
/* Peer address, set with libsoccr_set_addr(sk, 0, ...). */
union libsoccr_addr *dst_addr;
};
/* Ownership bits for libsoccr_sk.flags: the handle frees recv_queue, ... */
#define SK_FLAG_FREE_RQ 0x1
/* ... send_queue, ... */
#define SK_FLAG_FREE_SQ 0x2
/* ... src_addr, ... */
#define SK_FLAG_FREE_SA 0x4
/* ... dst_addr. */
#define SK_FLAG_FREE_DA 0x8
struct libsoccr_sk *libsoccr_pause(int fd)
{
struct libsoccr_sk *ret;
ret = malloc(sizeof(*ret));
if (!ret) {
loge("Unable to allocate memory\n");
return NULL;
}
if (tcp_repair_on(fd) < 0) {
free(ret);
return NULL;
}
ret->flags = 0;
ret->recv_queue = NULL;
ret->send_queue = NULL;
ret->src_addr = NULL;
ret->dst_addr = NULL;
ret->fd = fd;
return ret;
}
/*
 * Take the socket out of repair mode and destroy the handle.
 * A failure to leave repair mode is only logged; the handle is released
 * regardless.
 */
void libsoccr_resume(struct libsoccr_sk *sk)
{
	(void)tcp_repair_off(sk->fd);

	libsoccr_release(sk);
}
/*
 * Destroy the handle @sk. Queues and addresses are freed only when the
 * matching SK_FLAG_FREE_* bit says the handle owns them. The socket
 * itself is left untouched.
 */
void libsoccr_release(struct libsoccr_sk *sk)
{
	const unsigned owned = sk->flags;

	if (owned & SK_FLAG_FREE_DA)
		free(sk->dst_addr);
	if (owned & SK_FLAG_FREE_SA)
		free(sk->src_addr);
	if (owned & SK_FLAG_FREE_SQ)
		free(sk->send_queue);
	if (owned & SK_FLAG_FREE_RQ)
		free(sk->recv_queue);

	free(sk);
}
/*
 * Buffer handed to getsockopt(TCP_INFO) in refresh_sk(); the call is
 * rejected there unless exactly sizeof(struct soccr_tcp_info) bytes are
 * returned.
 * NOTE(review): presumably this mirrors the leading members of the
 * kernel's struct tcp_info -- confirm before changing the layout.
 */
struct soccr_tcp_info {
/* Connection state, checked against the TCP_* values in refresh_sk(). */
__u8 tcpi_state;
__u8 tcpi_ca_state;
__u8 tcpi_retransmits;
__u8 tcpi_probes;
__u8 tcpi_backoff;
/* TCPI_OPT_* bits, saved as opt_mask by get_stream_options(). */
__u8 tcpi_options;
/* Window scale factors, saved when TCPI_OPT_WSCALE is set. */
__u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
};
/*
 * Read the connection state and the queue lengths of @sk into @data.
 *
 * @ti receives the TCP_INFO data so the caller can look at the
 * negotiated options afterwards.
 *
 * Returns 0 on success, -1 if a query fails or the socket is in a state
 * this library does not handle.
 */
static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, struct soccr_tcp_info *ti)
{
	socklen_t ti_len = sizeof(*ti);
	int nbytes;

	if (getsockopt(sk->fd, SOL_TCP, TCP_INFO, ti, &ti_len) != 0 || ti_len != sizeof(*ti)) {
		logerr("Failed to obtain TCP_INFO");
		return -1;
	}

	switch (ti->tcpi_state) {
	case TCP_SYN_SENT:
	case TCP_ESTABLISHED:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_CLOSING:
	case TCP_CLOSE_WAIT:
	case TCP_LAST_ACK:
	case TCP_CLOSE:
		break;
	default:
		loge("Unknown state %d\n", ti->tcpi_state);
		return -1;
	}
	data->state = ti->tcpi_state;

	if (ioctl(sk->fd, SIOCOUTQ, &nbytes) == -1) {
		logerr("Unable to get size of snd queue");
		return -1;
	}
	data->outq_len = nbytes;

	if (ioctl(sk->fd, SIOCOUTQNSD, &nbytes) == -1) {
		logerr("Unable to get size of unsent data");
		return -1;
	}
	data->unsq_len = nbytes;

	if (data->state == TCP_CLOSE) {
		/*
		 * The connection may have been reset, in which case the send
		 * queue can still hold data nobody is able to read. Drop it;
		 * otherwise restore would have to find out whether the queue
		 * ends with a fin and whether a fin or a reset packet has to
		 * be sent to get the state back.
		 */
		data->outq_len = 0;
		data->unsq_len = 0;
	}

	/* The fin packet carries no real data, so it is not accounted. */
	if ((1 << data->state) & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN)) {
		if (data->outq_len)
			data->outq_len--;
		if (data->unsq_len)
			data->unsq_len--;
	}

	if (ioctl(sk->fd, SIOCINQ, &nbytes) == -1) {
		logerr("Unable to get size of recv queue");
		return -1;
	}
	data->inq_len = nbytes;

	return 0;
}
/*
 * Save the negotiated stream options of @sk into @data: the MSS clamp,
 * the option mask from @ti and, when the matching option bit is set, the
 * window scale factors and the timestamp value.
 *
 * Returns 0 on success, -1 if a getsockopt() call fails.
 */
static int get_stream_options(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, struct soccr_tcp_info *ti)
{
	socklen_t optlen = sizeof(data->mss_clamp);

	if (getsockopt(sk->fd, SOL_TCP, TCP_MAXSEG, &data->mss_clamp, &optlen) < 0) {
		logerr("\tsockopt failed");
		return -1;
	}

	data->opt_mask = ti->tcpi_options;

	if (ti->tcpi_options & TCPI_OPT_WSCALE) {
		data->snd_wscale = ti->tcpi_snd_wscale;
		data->rcv_wscale = ti->tcpi_rcv_wscale;
	}

	if (ti->tcpi_options & TCPI_OPT_TIMESTAMPS) {
		int tstamp;

		optlen = sizeof(tstamp);
		if (getsockopt(sk->fd, SOL_TCP, TCP_TIMESTAMP, &tstamp, &optlen) < 0) {
			logerr("\tsockopt failed");
			return -1;
		}
		data->timestamp = tstamp;
	}

	return 0;
}
static int get_window(struct libsoccr_sk *sk, struct libsoccr_sk_data *data)
{
struct tcp_repair_window opt;
socklen_t optlen = sizeof(opt);
if (getsockopt(sk->fd, SOL_TCP, TCP_REPAIR_WINDOW, &opt, &optlen)) {
/* Appeared since 4.8, but TCP_repair itself is since 3.11 */
if (errno == ENOPROTOOPT)
return 0;
logerr("Unable to get window properties");
return -1;
}
data->flags |= SOCCR_FLAGS_WINDOW;
data->snd_wl1 = opt.snd_wl1;
data->snd_wnd = opt.snd_wnd;
data->max_window = opt.max_window;
data->rcv_wnd = opt.rcv_wnd;
data->rcv_wup = opt.rcv_wup;
return 0;
}
/*
 * TCP queues sequences and their relations to the code below
 *
 *       output queue
 * net <----------------------------- sk
 *        ^       ^       ^    seq >>
 *        snd_una snd_nxt write_seq
 *
 *                     input  queue
 * net -----------------------------> sk
 *     << seq   ^       ^
 *              rcv_nxt copied_seq
 *
 *
 * inq_len  = rcv_nxt - copied_seq = SIOCINQ
 * outq_len = write_seq - snd_una = SIOCOUTQ
 * inq_seq  = rcv_nxt
 * outq_seq = write_seq
 *
 * On restore kernel moves the option we configure with setsockopt,
 * thus we should advance them on the _len value in restore_tcp_seqs.
 *
 */
/*
 * Read the sequence number and the contents of one repair queue.
 *
 * @sk:       socket in repair mode
 * @queue_id: TCP_RECV_QUEUE or TCP_SEND_QUEUE
 * @seq:      receives the queue sequence number
 * @len:      number of bytes expected in the queue
 * @bufp:     receives a malloc()ed copy of the data, or NULL when @len is 0
 *
 * Returns 0 on success and -1 on failure; *bufp is written only on success.
 */
static int get_queue(int sk, int queue_id, __u32 *seq, __u32 len, char **bufp)
{
	int qid = queue_id;
	socklen_t optlen = sizeof(qid);
	char *bytes = NULL;
	int got;

	if (setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &qid, optlen) < 0) {
		logerr("\tsockopt failed");
		return -1;
	}

	optlen = sizeof(*seq);
	if (getsockopt(sk, SOL_TCP, TCP_QUEUE_SEQ, seq, &optlen) < 0) {
		logerr("\tsockopt failed");
		return -1;
	}

	if (len) {
		/*
		 * Ask for one byte more than expected: getting exactly len
		 * bytes back proves the queue really holds len bytes.
		 */
		bytes = malloc(len + 1);
		if (!bytes) {
			loge("Unable to allocate memory\n");
			return -1;
		}

		got = recv(sk, bytes, len + 1, MSG_PEEK | MSG_DONTWAIT);
		if (got != len) {
			logerr("\trecv failed (%d, want %d)", got, len);
			free(bytes);
			return -1;
		}
	}

	*bufp = bytes;
	return 0;
}
/*
 * Size of struct libsoccr_sk_data in the initial libsoccr; callers must
 * provide at least this much.
 */
#define SOCR_DATA_MIN_SIZE (17 * sizeof(__u32))

/*
 * Dump the state of the paused socket @sk.
 *
 * The first @data_size bytes of @data are cleared and then filled with
 * the connection state, stream options and window parameters. The queue
 * contents are stored in @sk and can be fetched with
 * libsoccr_get_queue_bytes().
 *
 * Returns sizeof(struct libsoccr_sk_data) on success or a negative value
 * identifying the step that failed.
 */
int libsoccr_save(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size)
{
	struct soccr_tcp_info ti;

	if (data == NULL || data_size < SOCR_DATA_MIN_SIZE) {
		loge("Invalid input parameters\n");
		return -1;
	}

	memset(data, 0, data_size);

	if (refresh_sk(sk, data, &ti) != 0)
		return -2;

	if (get_stream_options(sk, data, &ti) != 0)
		return -3;

	if (get_window(sk, data) != 0)
		return -4;

	/* From here on the handle owns whatever the queue pointers hold. */
	sk->flags |= SK_FLAG_FREE_SQ | SK_FLAG_FREE_RQ;

	if (get_queue(sk->fd, TCP_RECV_QUEUE, &data->inq_seq, data->inq_len, &sk->recv_queue) != 0)
		return -5;

	if (get_queue(sk->fd, TCP_SEND_QUEUE, &data->outq_seq, data->outq_len, &sk->send_queue) != 0)
		return -6;

	return sizeof(struct libsoccr_sk_data);
}
/* Flags accepted by libsoccr_get_queue_bytes(). */
#define GET_Q_FLAGS (SOCCR_MEM_EXCL)

/*
 * Return the bytes stored in @sk for the queue @queue_id
 * (TCP_RECV_QUEUE or TCP_SEND_QUEUE).
 *
 * With SOCCR_MEM_EXCL the pointer is detached from the handle, so the
 * handle will no longer free it. Returns NULL for unknown flags, an
 * unknown queue, or when nothing is stored.
 */
char *libsoccr_get_queue_bytes(struct libsoccr_sk *sk, int queue_id, unsigned flags)
{
	char **slot;
	char *bytes;

	if (flags & ~GET_Q_FLAGS)
		return NULL;

	if (queue_id == TCP_RECV_QUEUE)
		slot = &sk->recv_queue;
	else if (queue_id == TCP_SEND_QUEUE)
		slot = &sk->send_queue;
	else
		return NULL;

	bytes = *slot;
	if (flags & SOCCR_MEM_EXCL)
		*slot = NULL;

	return bytes;
}
/* Flags accepted by libsoccr_get_addr(). */
#define GET_SA_FLAGS (SOCCR_MEM_EXCL)

/*
 * Placeholder: address retrieval is not implemented in the library, so
 * NULL is returned for every input (unknown flags are rejected first).
 */
union libsoccr_addr *libsoccr_get_addr(struct libsoccr_sk *sk, int self, unsigned flags)
{
	(void)sk;
	(void)self;

	if ((flags & ~GET_SA_FLAGS) != 0)
		return NULL;

	/* FIXME -- implemented in CRIU, makes sense to have it here too */
	return NULL;
}
/*
 * Select the repair queue @queue on @sk and set its sequence number to
 * @seq. Returns 0 on success, -1 if either setsockopt() call fails.
 */
static int set_queue_seq(struct libsoccr_sk *sk, int queue, __u32 seq)
{
	const int fd = sk->fd;

	logd("\tSetting %d queue seq to %u\n", queue, seq);

	if (setsockopt(fd, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) {
		logerr("Can't set repair queue");
		return -1;
	}

	if (setsockopt(fd, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)) < 0) {
		logerr("Can't set queue seq");
		return -1;
	}

	return 0;
}
/* Fall back to the other spelling of the SACK-permitted option code when TCPOPT_SACK_PERM is missing. */
#ifndef TCPOPT_SACK_PERM
#define TCPOPT_SACK_PERM TCPOPT_SACK_PERMITTED
#endif
/*
 * Restore everything about the connection except the queue contents:
 * bind to the source address, set both queue sequence numbers, connect
 * to the destination address and re-apply the negotiated TCP options.
 *
 * The addresses must have been set with libsoccr_set_addr() beforehand.
 * Note that data->inq_seq and data->outq_seq may be decremented here.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size)
{
/* At most four options are restored: SACK, window scale, timestamps and MSS. */
struct tcp_repair_opt opts[4];
int addr_size, mstate;
int onr = 0;
__u32 seq;
if (!data || data_size < SOCR_DATA_MIN_SIZE) {
loge("Invalid input parameters\n");
return -1;
}
if (!sk->dst_addr || !sk->src_addr) {
loge("Destination or/and source addresses aren't set\n");
return -1;
}
/* Bit form of the state, for matching against the SNDQ_ and RCVQ_ state sets. */
mstate = 1 << data->state;
if (data->state == TCP_LISTEN) {
loge("Unable to handle listen sockets\n");
return -1;
}
if (sk->src_addr->sa.sa_family == AF_INET)
addr_size = sizeof(sk->src_addr->v4);
else
addr_size = sizeof(sk->src_addr->v6);
if (bind(sk->fd, &sk->src_addr->sa, addr_size)) {
logerr("Can't bind inet socket back");
return -1;
}
/* NOTE(review): presumably inq_seq counts the received fin, which is restored separately -- confirm. */
if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN))
data->inq_seq--;
/* outq_seq is adjusted due to not accounting the fin packet */
if (mstate & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN))
data->outq_seq--;
/* Start each queue at the sequence number of its first byte. */
if (set_queue_seq(sk, TCP_RECV_QUEUE, data->inq_seq - data->inq_len))
return -2;
seq = data->outq_seq - data->outq_len;
if (data->state == TCP_SYN_SENT)
seq--;
if (set_queue_seq(sk, TCP_SEND_QUEUE, seq))
return -3;
if (sk->dst_addr->sa.sa_family == AF_INET)
addr_size = sizeof(sk->dst_addr->v4);
else
addr_size = sizeof(sk->dst_addr->v6);
/*
 * For SYN_SENT the connect() call is made with repair mode switched
 * off, and EINPROGRESS is accepted as success.
 */
if (data->state == TCP_SYN_SENT && tcp_repair_off(sk->fd))
return -1;
if (connect(sk->fd, &sk->dst_addr->sa, addr_size) == -1 && errno != EINPROGRESS) {
logerr("Can't connect inet socket back");
return -1;
}
if (data->state == TCP_SYN_SENT && tcp_repair_on(sk->fd))
return -1;
logd("\tRestoring TCP options\n");
if (data->opt_mask & TCPI_OPT_SACK) {
logd("\t\tWill turn SAK on\n");
opts[onr].opt_code = TCPOPT_SACK_PERM;
opts[onr].opt_val = 0;
onr++;
}
if (data->opt_mask & TCPI_OPT_WSCALE) {
logd("\t\tWill set snd_wscale to %u\n", data->snd_wscale);
logd("\t\tWill set rcv_wscale to %u\n", data->rcv_wscale);
opts[onr].opt_code = TCPOPT_WINDOW;
/* Both scale factors travel in one value: send in the low bits, receive shifted left by 16. */
opts[onr].opt_val = data->snd_wscale + (data->rcv_wscale << 16);
onr++;
}
if (data->opt_mask & TCPI_OPT_TIMESTAMPS) {
logd("\t\tWill turn timestamps on\n");
opts[onr].opt_code = TCPOPT_TIMESTAMP;
opts[onr].opt_val = 0;
onr++;
}
logd("Will set mss clamp to %u\n", data->mss_clamp);
opts[onr].opt_code = TCPOPT_MAXSEG;
opts[onr].opt_val = data->mss_clamp;
onr++;
/* The collected options are not applied to a SYN_SENT socket. */
if (data->state != TCP_SYN_SENT &&
setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_OPTIONS, opts, onr * sizeof(struct tcp_repair_opt)) < 0) {
logerr("Can't repair options");
return -2;
}
if (data->opt_mask & TCPI_OPT_TIMESTAMPS) {
if (setsockopt(sk->fd, SOL_TCP, TCP_TIMESTAMP, &data->timestamp, sizeof(data->timestamp)) < 0) {
logerr("Can't set timestamp");
return -3;
}
}
return 0;
}
/*
 * IPv4-Mapped IPv6 Addresses
 *
 * Return non-zero when @addr holds an address of the form ::ffff:a.b.c.d,
 * i.e. the first 80 bits are zero and the following 16 bits are all ones.
 *
 * All three leading 32-bit words are checked. Looking at the third word
 * alone would also match ordinary IPv6 addresses that merely carry
 * 0x0000ffff in bits 64..95, and such a connection would then be treated
 * as TCP over IPv4.
 */
static int ipv6_addr_mapped(union libsoccr_addr *addr)
{
	return addr->v6.sin6_addr.s6_addr32[0] == 0 &&
	       addr->v6.sin6_addr.s6_addr32[1] == 0 &&
	       addr->v6.sin6_addr.s6_addr32[2] == htonl(0x0000ffff);
}
/*
 * Inject a fake TCP segment into the socket @sk so that it looks as if
 * the peer had sent it: the packet goes from the destination address and
 * port to the source address and port.
 *
 * @data:      supplies the sequence number (inq_seq), the acknowledgement
 *             number (outq_seq - outq_len) and the window (rcv_wnd)
 * @data_size: not used here
 * @flags:     TCP control flags of the segment (TH_FIN, TH_ACK, ...)
 *
 * The packet is written through a libnet raw socket marked with
 * SOCCR_MARK. An IPv4-mapped IPv6 destination is sent as plain IPv4.
 *
 * Returns 0 on success and -1 on failure.
 */
static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, uint8_t flags)
{
	uint32_t src_v4 = sk->src_addr->v4.sin_addr.s_addr;
	uint32_t dst_v4 = sk->dst_addr->v4.sin_addr.s_addr;
	int ret, exit_code = -1, family;
	char errbuf[LIBNET_ERRBUF_SIZE];
	int mark = SOCCR_MARK;
	int libnet_type;
	libnet_t *l;

	family = sk->dst_addr->sa.sa_family;
	if (family == AF_INET6 && ipv6_addr_mapped(sk->dst_addr)) {
		/* TCP over IPv4 */
		family = AF_INET;
		dst_v4 = sk->dst_addr->v6.sin6_addr.s6_addr32[3];
		src_v4 = sk->src_addr->v6.sin6_addr.s6_addr32[3];
	}

	if (family == AF_INET6)
		libnet_type = LIBNET_RAW6;
	else
		libnet_type = LIBNET_RAW4;

	l = libnet_init(libnet_type, /* injection type */
			NULL,	     /* network interface */
			errbuf);     /* errbuf */
	if (l == NULL) {
		loge("libnet_init failed (%s)\n", errbuf);
		return -1;
	}

	if (setsockopt(l->fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark))) {
		/* logerr() appends ": <errno text>\n" itself, so no newline here. */
		logerr("Can't set SO_MARK (%d) for socket", mark);
		goto err;
	}

	/* The ports are swapped on purpose: the packet pretends to come from the peer. */
	ret = libnet_build_tcp(ntohs(sk->dst_addr->v4.sin_port), /* source port */
			       ntohs(sk->src_addr->v4.sin_port), /* destination port */
			       data->inq_seq,			  /* sequence number */
			       data->outq_seq - data->outq_len,	  /* acknowledgement num */
			       flags,				  /* control flags */
			       data->rcv_wnd,			  /* window size */
			       0,				  /* checksum */
			       10,				  /* urgent pointer */
			       LIBNET_TCP_H + 20,		  /* TCP packet size */
			       NULL,				  /* payload */
			       0,				  /* payload size */
			       l,				  /* libnet handle */
			       0);				  /* libnet id */
	if (ret == -1) {
		loge("Can't build TCP header: %s\n", libnet_geterror(l));
		goto err;
	}

	if (family == AF_INET6) {
		struct libnet_in6_addr src, dst;

		memcpy(&dst, &sk->dst_addr->v6.sin6_addr, sizeof(dst));
		memcpy(&src, &sk->src_addr->v6.sin6_addr, sizeof(src));

		ret = libnet_build_ipv6(0, 0, LIBNET_TCP_H, /* length */
					IPPROTO_TCP,	    /* protocol */
					64,		    /* hop limit */
					dst,		    /* source IP */
					src,		    /* destination IP */
					NULL,		    /* payload */
					0,		    /* payload size */
					l,		    /* libnet handle */
					0);		    /* libnet id */
	} else if (family == AF_INET)
		ret = libnet_build_ipv4(LIBNET_IPV4_H + LIBNET_TCP_H + 20, /* length */
					0,				   /* TOS */
					242,				   /* IP ID */
					0,				   /* IP Frag */
					64,				   /* TTL */
					IPPROTO_TCP,			   /* protocol */
					0,				   /* checksum */
					dst_v4,				   /* source IP */
					src_v4,				   /* destination IP */
					NULL,				   /* payload */
					0,				   /* payload size */
					l,				   /* libnet handle */
					0);				   /* libnet id */
	else {
		loge("Unknown socket family\n");
		goto err;
	}
	if (ret == -1) {
		loge("Can't build IP header: %s\n", libnet_geterror(l));
		goto err;
	}

	ret = libnet_write(l);
	if (ret == -1) {
		loge("Unable to send a fin packet: %s\n", libnet_geterror(l));
		goto err;
	}

	exit_code = 0;
err:
	/* The libnet context is destroyed on success and on failure alike. */
	libnet_destroy(l);
	return exit_code;
}
/*
 * Put a fin packet back into the send queue of @sk by shutting down the
 * write side.
 *
 * When @acked is non-zero the send queue is selected around the
 * shutdown() call, which makes the fin get restored as an already sent
 * packet; the queue selection is reset to TCP_NO_QUEUE afterwards.
 *
 * Returns the shutdown() result, or -1 if selecting the queue fails.
 */
static int restore_fin_in_snd_queue(int sk, int acked)
{
	int qid;
	int rc;

	if (acked) {
		/*
		 * If TCP_SEND_QUEUE is set, a fin packet will be
		 * restored as a sent packet.
		 */
		qid = TCP_SEND_QUEUE;
		if (setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &qid, sizeof(qid)) < 0) {
			logerr("Can't set repair queue");
			return -1;
		}
	}

	rc = shutdown(sk, SHUT_WR);
	if (rc < 0)
		logerr("Unable to shut down a socket");

	if (acked) {
		qid = TCP_NO_QUEUE;
		if (setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &qid, sizeof(qid)) < 0) {
			logerr("Can't set repair queue");
			return -1;
		}
	}

	return rc;
}
static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue,
char *buf);
int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size)
{
int mstate = 1 << data->state;
if (libsoccr_set_sk_data_noq(sk, data, data_size))
return -1;
if (libsoccr_restore_queue(sk, data, sizeof(*data), TCP_RECV_QUEUE, sk->recv_queue))
return -1;
if (libsoccr_restore_queue(sk, data, sizeof(*data), TCP_SEND_QUEUE, sk->send_queue))
return -1;
if (data->flags & SOCCR_FLAGS_WINDOW) {
struct tcp_repair_window wopt = {
.snd_wl1 = data->snd_wl1,
.snd_wnd = data->snd_wnd,
.max_window = data->max_window,
.rcv_wnd = data->rcv_wnd,
.rcv_wup = data->rcv_wup,
};
if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) {
wopt.rcv_wup--;
wopt.rcv_wnd++;
}
if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_WINDOW, &wopt, sizeof(wopt))) {
logerr("Unable to set window parameters");
return -1;
}
}
/*
* To restore a half closed sockets, fin packets has to be restored in
* recv and send queues. Here shutdown() is used to restore a fin
* packet in the send queue and a fake fin packet is send to restore it
* in the recv queue.
*/
if (mstate & SNDQ_FIRST_FIN)
restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED);
/* Send a fin packet to the socket to restore it in a receive queue. */
if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN))
if (send_fin(sk, data, data_size, TH_ACK | TH_FIN) < 0)
return -1;
if (mstate & SNDQ_SECOND_FIN)
restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED);
if (mstate & RCVQ_FIN_ACKED)
data->inq_seq++;
if (mstate & SNDQ_FIN_ACKED) {
data->outq_seq++;
if (send_fin(sk, data, data_size, TH_ACK) < 0)
return -1;
}
return 0;
}
/*
 * Write @len bytes from @buf into the socket with send(), in as many
 * calls as it takes.
 *
 * @queue is only a name used in the error message. Which queue the data
 * ends up in depends on the repair state the caller has set up.
 *
 * When a send() returns an error or zero and the chunk limit is still
 * above 1024 bytes, the limit is halved and the send is retried;
 * otherwise the failure is logged and -1 is returned.
 * Returns 0 once everything has been written. The callers in this file
 * never pass a zero @len.
 */
static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __u32 len)
{
int ret, err = -1, max_chunk;
int off;
/* Begin by trying to push everything in one call. */
max_chunk = len;
off = 0;
do {
/* Never ask for more than the current chunk limit. */
int chunk = len;
if (chunk > max_chunk)
chunk = max_chunk;
ret = send(sk->fd, buf + off, chunk, 0);
if (ret <= 0) {
if (max_chunk > 1024) {
/*
 * Kernel not only refuses the whole chunk,
 * but refuses to split it into pieces too.
 *
 * When restoring recv queue in repair mode
 * kernel doesn't try hard and just allocates
 * a linear skb with the size we pass to the
 * system call. Thus, if the size is too big
 * for slab allocator, the send just fails
 * with ENOMEM.
 *
 * In any case -- try smaller chunk, hopefully
 * there's still enough memory in the system.
 */
max_chunk >>= 1;
/* Jumps to the loop condition; len is unchanged, so the send is retried. */
continue;
}
logerr("Can't restore %s queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk);
goto err;
}
/* Partial writes are fine: advance and go on with the rest. */
off += ret;
len -= ret;
} while (len);
err = 0;
err:
return err;
}
/*
 * Select the repair queue @queue on @sk and write @len bytes from @buf
 * into it. Returns 0 on success and -1 on failure.
 */
static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len)
{
	const char *queue_name;

	logd("\tRestoring TCP %d queue data %u bytes\n", queue, len);

	if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) {
		logerr("Can't set repair queue");
		return -1;
	}

	if (queue == TCP_RECV_QUEUE)
		queue_name = "recv";
	else
		queue_name = "send";

	return __send_queue(sk, queue_name, buf, len);
}
/*
 * Put the saved bytes @buf back into the queue @queue of @sk.
 *
 * @queue is TCP_RECV_QUEUE or TCP_SEND_QUEUE; the lengths are taken from
 * @data (inq_len, outq_len, unsq_len). A NULL @buf means there is nothing
 * to restore.
 *
 * Returns 0 on success and a negative value on failure:
 *   -1  invalid @data or @data_size
 *   -2  inconsistent send queue lengths, or the sent part of the send
 *       queue could not be restored
 *   -3  repair mode could not be left, or the unsent part could not be
 *       restored
 *   -4  repair mode could not be turned back on
 *   -5  unknown @queue
 */
static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue,
				  char *buf)
{
	if (!buf)
		return 0;

	if (!data || data_size < SOCR_DATA_MIN_SIZE)
		return -1;

	if (queue == TCP_RECV_QUEUE) {
		if (!data->inq_len)
			return 0;
		return send_queue(sk, TCP_RECV_QUEUE, buf, data->inq_len);
	}

	if (queue == TCP_SEND_QUEUE) {
		__u32 len, ulen;

		/*
		 * All data in a write buffer can be divided on two parts sent
		 * but not yet acknowledged data and unsent data.
		 * The TCP stack must know which data have been sent, because
		 * acknowledgment can be received for them. These data must be
		 * restored in repair mode.
		 */
		ulen = data->unsq_len;

		/*
		 * The unsent part is a tail of the send queue. If it claims to
		 * be longer than the queue, the subtraction below would wrap
		 * around and send() would read far past the end of buf.
		 */
		if (ulen > data->outq_len) {
			loge("Unsent data length (%u) exceeds send queue length (%u)\n", (unsigned)ulen,
			     (unsigned)data->outq_len);
			return -2;
		}

		len = data->outq_len - ulen;
		if (len && send_queue(sk, TCP_SEND_QUEUE, buf, len))
			return -2;

		if (ulen) {
			/*
			 * The second part of data have never been sent to outside, so
			 * they can be restored without any tricks.
			 *
			 * This only holds outside of repair mode, so do not go
			 * on if repair mode cannot be left.
			 */
			if (tcp_repair_off(sk->fd))
				return -3;
			if (__send_queue(sk, "not-sent send", buf + len, ulen))
				return -3;
			if (tcp_repair_on(sk->fd))
				return -4;
		}

		return 0;
	}

	return -5;
}
/* Flags accepted by libsoccr_set_queue_bytes(). */
#define SET_Q_FLAGS (SOCCR_MEM_EXCL)

/*
 * Attach @bytes to @sk as the contents of the queue @queue_id
 * (TCP_RECV_QUEUE or TCP_SEND_QUEUE) for a later restore.
 *
 * With SOCCR_MEM_EXCL the handle takes ownership of @bytes and frees it
 * in libsoccr_release(). Returns 0 on success, -1 for unknown flags or
 * an unknown queue.
 */
int libsoccr_set_queue_bytes(struct libsoccr_sk *sk, int queue_id, char *bytes, unsigned flags)
{
	const int take_ownership = (flags & SOCCR_MEM_EXCL) != 0;

	if (flags & ~SET_Q_FLAGS)
		return -1;

	if (queue_id == TCP_RECV_QUEUE) {
		sk->recv_queue = bytes;
		if (take_ownership)
			sk->flags |= SK_FLAG_FREE_RQ;
		return 0;
	}

	if (queue_id == TCP_SEND_QUEUE) {
		sk->send_queue = bytes;
		if (take_ownership)
			sk->flags |= SK_FLAG_FREE_SQ;
		return 0;
	}

	return -1;
}
/* Flags accepted by libsoccr_set_addr(). */
#define SET_SA_FLAGS (SOCCR_MEM_EXCL)

/*
 * Attach an address to @sk: the source (local) address when @self is
 * non-zero, the destination (peer) address otherwise.
 *
 * With SOCCR_MEM_EXCL the handle takes ownership of @addr and frees it
 * in libsoccr_release(). Returns 0 on success, -1 for unknown flags.
 */
int libsoccr_set_addr(struct libsoccr_sk *sk, int self, union libsoccr_addr *addr, unsigned flags)
{
	unsigned owner_bit;

	if (flags & ~SET_SA_FLAGS)
		return -1;

	if (self) {
		sk->src_addr = addr;
		owner_bit = SK_FLAG_FREE_SA;
	} else {
		sk->dst_addr = addr;
		owner_bit = SK_FLAG_FREE_DA;
	}

	if (flags & SOCCR_MEM_EXCL)
		sk->flags |= owner_bit;

	return 0;
}