2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00
criu/net.c
Cyrill Gorcunov 2467a30fe2 net: Treat venet device as a separate complete device
When restoring venet device we need to restore its
index as well, which actually possible with new iproute2
package but the problem is that the index itself lays
inside image file. We could use crit tool to extract
it but this would slowdon procedure signifantly (need
to run python which would parse the image, or need
to pass the index into environmnet from inside of
the CRIU itself).

So lets do a trick and simply created venet device
inside container by criu itself (thanks we support
creating venet via netlink interface now).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Andrew Vagin <avagin@odin.com>
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
2015-10-28 10:23:12 +03:00

973 lines
21 KiB
C

#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <net/if_arp.h>
#include <sys/wait.h>
#include <sched.h>
#include <sys/mount.h>
#include <net/if.h>
#include <linux/sockios.h>
#include "imgset.h"
#include "syscall-types.h"
#include "namespaces.h"
#include "net.h"
#include "libnetlink.h"
#include "cr_options.h"
#include "sk-inet.h"
#include "tun.h"
#include "util-pie.h"
#include "plugin.h"
#include "action-scripts.h"
#include "sockets.h"
#include "pstree.h"
#include "sysctl.h"
#include "protobuf.h"
#include "protobuf/netdev.pb-c.h"
static int ns_fd = -1;
static int ns_sysfs_fd = -1;
int read_ns_sys_file(char *path, char *buf, int len)
{
int fd, rlen;
BUG_ON(ns_sysfs_fd == -1);
fd = openat(ns_sysfs_fd, path, O_RDONLY, 0);
if (fd < 0) {
pr_perror("Can't open ns' %s", path);
return -1;
}
rlen = read(fd, buf, len);
close(fd);
if (rlen >= 0)
buf[rlen] = '\0';
return rlen;
}
static char *devconfs[] = {
"accept_local",
"accept_redirects",
"accept_source_route",
"arp_accept",
"arp_announce",
"arp_filter",
"arp_ignore",
"arp_notify",
"bootp_relay",
"disable_policy",
"disable_xfrm",
"force_igmp_version",
"forwarding",
"igmpv2_unsolicited_report_interval",
"igmpv3_unsolicited_report_interval",
"log_martians",
"medium_id",
"promote_secondaries",
"proxy_arp",
"proxy_arp_pvlan",
"route_localnet",
"rp_filter",
"secure_redirects",
"send_redirects",
"shared_media",
"src_valid_mark",
"tag",
};
/*
* I case if some entry is missing in
* the kernel, simply write DEVCONFS_UNUSED
* into the image so we would skip it.
*/
#define DEVCONFS_UNUSED (-1u)
#define NET_CONF_PATH "net/ipv4/conf"
#define MAX_CONF_OPT_PATH IFNAMSIZ+50
static int ipv4_conf_op(char *tgt, int *conf, int op, NetnsEntry **netns)
{
int i, ri;
int ret, flags = op == CTL_READ ? CTL_FLAGS_OPTIONAL : 0;
struct sysctl_req req[ARRAY_SIZE(devconfs)];
char path[ARRAY_SIZE(devconfs)][MAX_CONF_OPT_PATH];
for (i = 0, ri = 0; i < ARRAY_SIZE(devconfs); i++) {
/*
* If dev conf value is the same as default skip restoring it
*/
if (netns && conf[i] == (*netns)->def_conf[i]) {
pr_debug("DEBUG Skip %s/%s, val =%d\n", tgt, devconfs[i], conf[i]);
continue;
}
if (op == CTL_WRITE && conf[i] == DEVCONFS_UNUSED)
continue;
else if (op == CTL_READ)
conf[i] = DEVCONFS_UNUSED;
snprintf(path[i], MAX_CONF_OPT_PATH, "%s/%s/%s", NET_CONF_PATH, tgt, devconfs[i]);
req[ri].name = path[i];
req[ri].arg = &conf[i];
req[ri].type = CTL_32;
req[ri].flags = flags;
ri++;
}
ret = sysctl_op(req, ri, op);
if (ret < 0) {
pr_err("Failed to %s %s/<confs>\n", (op == CTL_READ)?"read":"write", tgt);
return -1;
}
return 0;
}
int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds)
{
return pb_write_one(img_from_set(fds, CR_FD_NETDEV), nde, PB_NETDEV);
}
static int dump_one_netdev(int type, struct ifinfomsg *ifi,
struct rtattr **tb, struct cr_imgset *fds,
int (*dump)(NetDeviceEntry *, struct cr_imgset *))
{
int ret;
NetDeviceEntry netdev = NET_DEVICE_ENTRY__INIT;
if (!tb[IFLA_IFNAME]) {
pr_err("No name for link %d\n", ifi->ifi_index);
return -1;
}
netdev.type = type;
netdev.ifindex = ifi->ifi_index;
netdev.mtu = *(int *)RTA_DATA(tb[IFLA_MTU]);
netdev.flags = ifi->ifi_flags;
netdev.name = RTA_DATA(tb[IFLA_IFNAME]);
if (tb[IFLA_ADDRESS] && (type != ND_TYPE__LOOPBACK)) {
netdev.has_address = true;
netdev.address.data = RTA_DATA(tb[IFLA_ADDRESS]);
netdev.address.len = RTA_PAYLOAD(tb[IFLA_ADDRESS]);
pr_info("Found ll addr (%02x:../%d) for %s\n",
(int)netdev.address.data[0],
(int)netdev.address.len, netdev.name);
}
netdev.n_conf = ARRAY_SIZE(devconfs);
netdev.conf = xmalloc(sizeof(int) * netdev.n_conf);
if (!netdev.conf)
return -1;
ret = ipv4_conf_op(netdev.name, netdev.conf, CTL_READ, NULL);
if (ret < 0)
goto err_free;
if (!dump)
dump = write_netdev_img;
ret = dump(&netdev, fds);
err_free:
xfree(netdev.conf);
return ret;
}
static char *link_kind(struct ifinfomsg *ifi, struct rtattr **tb)
{
struct rtattr *linkinfo[IFLA_INFO_MAX + 1];
if (!tb[IFLA_LINKINFO]) {
pr_err("No linkinfo for eth link %d\n", ifi->ifi_index);
return NULL;
}
parse_rtattr_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
if (!linkinfo[IFLA_INFO_KIND]) {
pr_err("No kind for eth link %d\n", ifi->ifi_index);
return NULL;
}
return RTA_DATA(linkinfo[IFLA_INFO_KIND]);
}
static int dump_unknown_device(struct ifinfomsg *ifi, char *kind,
struct rtattr **tb, struct cr_imgset *fds)
{
int ret;
ret = run_plugins(DUMP_EXT_LINK, ifi->ifi_index, ifi->ifi_type, kind);
if (ret == 0)
return dump_one_netdev(ND_TYPE__EXTLINK, ifi, tb, fds, NULL);
if (ret == -ENOTSUP)
pr_err("Unsupported link %d (type %d kind %s)\n",
ifi->ifi_index, ifi->ifi_type, kind);
return -1;
}
static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
struct rtattr **tb, struct cr_imgset *fds)
{
if (!strcmp(kind, "veth"))
/*
* This is not correct. The peer of the veth device may
* be either outside or inside the netns we're working
* on, but there's currently no way of finding this out.
*
* Sigh... we have to assume, that the veth device is a
* connection to the outer world and just dump this end :(
*/
return dump_one_netdev(ND_TYPE__VETH, ifi, tb, fds, NULL);
if (!strcmp(kind, "tun"))
return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link);
return dump_unknown_device(ifi, kind, tb, fds);
}
static int dump_one_gendev(struct ifinfomsg *ifi, char *kind,
struct rtattr **tb, struct cr_imgset *fds)
{
if (!strcmp(kind, "tun"))
return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link);
return dump_unknown_device(ifi, kind, tb, fds);
}
static int dump_one_voiddev(struct ifinfomsg *ifi, char *kind,
struct rtattr **tb, struct cr_imgset *fds)
{
if (!strcmp(kind, "venet"))
return dump_one_netdev(ND_TYPE__VENET, ifi, tb, fds, NULL);
return dump_unknown_device(ifi, kind, tb, fds);
}
static int dump_one_link(struct nlmsghdr *hdr, void *arg)
{
struct cr_imgset *fds = arg;
struct ifinfomsg *ifi;
int ret = 0, len = hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
struct rtattr *tb[IFLA_MAX + 1];
char *kind;
ifi = NLMSG_DATA(hdr);
if (len < 0) {
pr_err("No iflas for link %d\n", ifi->ifi_index);
return -1;
}
parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
pr_info("\tLD: Got link %d, type %d\n", ifi->ifi_index, ifi->ifi_type);
if (ifi->ifi_type == ARPHRD_LOOPBACK)
return dump_one_netdev(ND_TYPE__LOOPBACK, ifi, tb, fds, NULL);
kind = link_kind(ifi, tb);
if (!kind)
goto unk;
switch (ifi->ifi_type) {
case ARPHRD_ETHER:
ret = dump_one_ethernet(ifi, kind, tb, fds);
break;
case ARPHRD_NONE:
ret = dump_one_gendev(ifi, kind, tb, fds);
break;
case ARPHRD_VOID:
ret = dump_one_voiddev(ifi, kind, tb, fds);
break;
default:
unk:
ret = dump_unknown_device(ifi, kind, tb, fds);
break;
}
return ret;
}
static int dump_links(struct cr_imgset *fds)
{
int sk, ret;
struct {
struct nlmsghdr nlh;
struct rtgenmsg g;
} req;
pr_info("Dumping netns links\n");
ret = sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
if (sk < 0) {
pr_perror("Can't open rtnl sock for net dump");
goto out;
}
memset(&req, 0, sizeof(req));
req.nlh.nlmsg_len = sizeof(req);
req.nlh.nlmsg_type = RTM_GETLINK;
req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
req.nlh.nlmsg_pid = 0;
req.nlh.nlmsg_seq = CR_NLMSG_SEQ;
req.g.rtgen_family = AF_PACKET;
ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_link, NULL, fds);
close(sk);
out:
return ret;
}
static int restore_link_cb(struct nlmsghdr *hdr, void *arg)
{
pr_info("Got response on SETLINK =)\n");
return 0;
}
struct newlink_req {
struct nlmsghdr h;
struct ifinfomsg i;
char buf[1024];
};
static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk,
int (*link_info)(NetDeviceEntry *, struct newlink_req *))
{
struct newlink_req req;
memset(&req, 0, sizeof(req));
req.h.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
req.h.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE;
req.h.nlmsg_type = msg_type;
req.h.nlmsg_seq = CR_NLMSG_SEQ;
req.i.ifi_family = AF_PACKET;
/*
* SETLINK is called for external devices which may
* have ifindex changed. Thus configure them by their
* name only.
*/
if (msg_type == RTM_NEWLINK)
req.i.ifi_index = nde->ifindex;
req.i.ifi_flags = nde->flags;
addattr_l(&req.h, sizeof(req), IFLA_IFNAME, nde->name, strlen(nde->name));
addattr_l(&req.h, sizeof(req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu));
if (nde->has_address) {
pr_debug("Restore ll addr (%02x:../%d) for device\n",
(int)nde->address.data[0], (int)nde->address.len);
addattr_l(&req.h, sizeof(req), IFLA_ADDRESS,
nde->address.data, nde->address.len);
}
if (link_info) {
struct rtattr *linkinfo;
int ret;
linkinfo = NLMSG_TAIL(&req.h);
addattr_l(&req.h, sizeof(req), IFLA_LINKINFO, NULL, 0);
ret = link_info(nde, &req);
if (ret < 0)
return ret;
linkinfo->rta_len = (void *)NLMSG_TAIL(&req.h) - (void *)linkinfo;
}
return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL);
}
int restore_link_parms(NetDeviceEntry *nde, int nlsk)
{
return do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL);
}
static int restore_one_link(NetDeviceEntry *nde, int nlsk,
int (*link_info)(NetDeviceEntry *, struct newlink_req *))
{
pr_info("Restoring netdev %s idx %d\n", nde->name, nde->ifindex);
return do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info);
}
#ifndef VETH_INFO_MAX
enum {
VETH_INFO_UNSPEC,
VETH_INFO_PEER,
__VETH_INFO_MAX
#define VETH_INFO_MAX (__VETH_INFO_MAX - 1)
};
#endif
#if IFLA_MAX <= 28
#define IFLA_NET_NS_FD 28
#endif
static int veth_link_info(NetDeviceEntry *nde, struct newlink_req *req)
{
struct rtattr *veth_data, *peer_data;
struct ifinfomsg ifm;
struct veth_pair *n;
BUG_ON(ns_fd < 0);
addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "veth", 4);
veth_data = NLMSG_TAIL(&req->h);
addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
peer_data = NLMSG_TAIL(&req->h);
memset(&ifm, 0, sizeof(ifm));
addattr_l(&req->h, sizeof(*req), VETH_INFO_PEER, &ifm, sizeof(ifm));
list_for_each_entry(n, &opts.veth_pairs, node) {
if (!strcmp(nde->name, n->inside))
break;
}
if (&n->node != &opts.veth_pairs)
addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, n->outside, strlen(n->outside));
addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd));
peer_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)peer_data;
veth_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)veth_data;
return 0;
}
static int venet_link_info(NetDeviceEntry *nde, struct newlink_req *req)
{
struct rtattr *venet_data;
BUG_ON(ns_fd < 0);
venet_data = NLMSG_TAIL(&req->h);
addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "venet", 5);
addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd));
venet_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)venet_data;
return 0;
}
static int restore_link(NetDeviceEntry *nde, int nlsk)
{
pr_info("Restoring link %s type %d\n", nde->name, nde->type);
switch (nde->type) {
case ND_TYPE__LOOPBACK: /* fallthrough */
case ND_TYPE__EXTLINK: /* see comment in protobuf/netdev.proto */
return restore_link_parms(nde, nlsk);
case ND_TYPE__VENET:
return restore_one_link(nde, nlsk, venet_link_info);
case ND_TYPE__VETH:
return restore_one_link(nde, nlsk, veth_link_info);
case ND_TYPE__TUN:
return restore_one_tun(nde, nlsk);
default:
pr_err("Unsupported link type %d\n", nde->type);
break;
}
return -1;
}
static int restore_links(int pid, NetnsEntry **netns)
{
int nlsk, ret;
struct cr_img *img;
NetDeviceEntry *nde;
img = open_image(CR_FD_NETDEV, O_RSTR, pid);
if (!img)
return -1;
nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
if (nlsk < 0) {
pr_perror("Can't create nlk socket");
close_image(img);
return -1;
}
while (1) {
ret = pb_read_one_eof(img, &nde, PB_NETDEV);
if (ret <= 0)
break;
ret = restore_link(nde, nlsk);
if (ret) {
pr_err("can not restore link");
goto exit;
}
if (nde->conf)
/*
* optimize restore of devices configuration except lo
* lo is created with namespace and before default is set
* so we cant optimize its restore
*/
ret = ipv4_conf_op(nde->name, nde->conf, CTL_WRITE, nde->type == ND_TYPE__LOOPBACK ? NULL : netns);
exit:
net_device_entry__free_unpacked(nde, NULL);
if (ret)
break;
}
close(nlsk);
close_image(img);
return ret;
}
static int run_ip_tool(char *arg1, char *arg2, int fdin, int fdout)
{
char *ip_tool_cmd;
int ret;
pr_debug("\tRunning ip %s %s\n", arg1, arg2);
ip_tool_cmd = getenv("CR_IP_TOOL");
if (!ip_tool_cmd)
ip_tool_cmd = "ip";
ret = cr_system(fdin, fdout, -1, ip_tool_cmd,
(char *[]) { "ip", arg1, arg2, NULL });
if (ret) {
pr_err("IP tool failed on %s %s\n", arg1, arg2);
return -1;
}
return 0;
}
static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
{
int ret;
char *cmd;
cmd = getenv("CR_IPTABLES");
if (!cmd)
cmd = def_cmd;
pr_debug("\tRunning %s for %s\n", cmd, def_cmd);
ret = cr_system(fdin, fdout, -1, "sh", (char *[]) { "sh", "-c", cmd, NULL });
if (ret)
pr_err("%s failed\n", def_cmd);
return ret;
}
static inline int dump_ifaddr(struct cr_imgset *fds)
{
struct cr_img *img = img_from_set(fds, CR_FD_IFADDR);
return run_ip_tool("addr", "save", -1, img_raw_fd(img));
}
static inline int dump_route(struct cr_imgset *fds)
{
struct cr_img *img = img_from_set(fds, CR_FD_ROUTE);
return run_ip_tool("route", "save", -1, img_raw_fd(img));
}
static inline int dump_iptables(struct cr_imgset *fds)
{
struct cr_img *img = img_from_set(fds, CR_FD_IPTABLES);
return run_iptables_tool("iptables-save", -1, img_raw_fd(img));
}
static int dump_netns_conf(struct cr_imgset *fds)
{
int ret;
NetnsEntry netns = NETNS_ENTRY__INIT;
netns.n_def_conf = ARRAY_SIZE(devconfs);
netns.n_all_conf = ARRAY_SIZE(devconfs);
netns.def_conf = xmalloc(sizeof(int) * netns.n_def_conf);
if (!netns.def_conf)
return -1;
netns.all_conf = xmalloc(sizeof(int) * netns.n_all_conf);
if (!netns.all_conf) {
xfree(netns.def_conf);
return -1;
}
ret = ipv4_conf_op("default", netns.def_conf, CTL_READ, NULL);
if (ret < 0)
goto err_free;
ret = ipv4_conf_op("all", netns.all_conf, CTL_READ, NULL);
if (ret < 0)
goto err_free;
ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS);
err_free:
xfree(netns.def_conf);
xfree(netns.all_conf);
return ret;
}
static int restore_ip_dump(int type, int pid, char *cmd)
{
int ret = -1;
struct cr_img *img;
img = open_image(type, O_RSTR, pid);
if (img) {
ret = run_ip_tool(cmd, "restore", img_raw_fd(img), -1);
close_image(img);
}
return ret;
}
static inline int restore_ifaddr(int pid)
{
return restore_ip_dump(CR_FD_IFADDR, pid, "addr");
}
static inline int restore_route(int pid)
{
return restore_ip_dump(CR_FD_ROUTE, pid, "route");
}
static inline int restore_iptables(int pid)
{
int ret = -1;
struct cr_img *img;
img = open_image(CR_FD_IPTABLES, O_RSTR, pid);
if (img) {
ret = run_iptables_tool("iptables-restore", img_raw_fd(img), -1);
close_image(img);
}
return ret;
}
static int restore_netns_conf(int pid, NetnsEntry **netns)
{
int ret = 0;
struct cr_img *img;
img = open_image(CR_FD_NETNS, O_RSTR, pid);
if (!img)
return -1;
if (empty_image(img))
/* Backward compatibility */
goto out;
ret = pb_read_one(img, netns, PB_NETNS);
if (ret < 0) {
pr_err("Can not read netns object\n");
return -1;
}
ret = ipv4_conf_op("default", (*netns)->def_conf, CTL_WRITE, NULL);
if (!ret)
ret = ipv4_conf_op("all", (*netns)->all_conf, CTL_WRITE, NULL);
out:
close_image(img);
return ret;
}
static int mount_ns_sysfs(void)
{
char sys_mount[] = "crtools-sys.XXXXXX";
BUG_ON(ns_sysfs_fd != -1);
/*
* A new mntns is required to avoid the race between
* open_detach_mount and creating mntns.
*/
if (unshare(CLONE_NEWNS)) {
pr_perror("Can't create new mount namespace");
return -1;
}
if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) {
pr_perror("Can't mark the root mount as private");
return -1;
}
if (mkdtemp(sys_mount) == NULL) {
pr_perror("mkdtemp failed %s", sys_mount);
return -1;
}
/*
* The setns() is called, so we're in proper context,
* no need in pulling the mountpoint from parasite.
*/
pr_info("Mount ns' sysfs in %s\n", sys_mount);
if (mount("sysfs", sys_mount, "sysfs", MS_MGC_VAL, NULL)) {
pr_perror("mount failed");
rmdir(sys_mount);
return -1;
}
ns_sysfs_fd = open_detach_mount(sys_mount);
return ns_sysfs_fd >= 0 ? 0 : -1;
}
int dump_net_ns(int ns_id)
{
struct cr_imgset *fds;
int ret;
fds = cr_imgset_open(ns_id, NETNS, O_DUMP);
if (fds == NULL)
return -1;
ret = mount_ns_sysfs();
if (!ret)
ret = dump_netns_conf(fds);
if (!ret)
ret = dump_links(fds);
if (!ret)
ret = dump_ifaddr(fds);
if (!ret)
ret = dump_route(fds);
if (!ret)
ret = dump_iptables(fds);
close(ns_sysfs_fd);
ns_sysfs_fd = -1;
close_cr_imgset(&fds);
return ret;
}
int prepare_net_ns(int pid)
{
int ret;
NetnsEntry *netns = NULL;
ret = restore_netns_conf(pid, &netns);
if (!ret)
ret = restore_links(pid, &netns);
if (netns)
netns_entry__free_unpacked(netns, NULL);
if (!ret)
ret = restore_ifaddr(pid);
if (!ret)
ret = restore_route(pid);
if (!ret)
ret = restore_iptables(pid);
close(ns_fd);
return ret;
}
int netns_pre_create(void)
{
ns_fd = open("/proc/self/ns/net", O_RDONLY | O_CLOEXEC);
if (ns_fd < 0) {
pr_perror("Can't cache net fd");
return -1;
}
pr_info("Saved netns fd for links restore\n");
return 0;
}
int network_lock(void)
{
pr_info("Lock network\n");
/* Each connection will be locked on dump */
if (!(root_ns_mask & CLONE_NEWNET))
return 0;
return run_scripts(ACT_NET_LOCK);
}
void network_unlock(void)
{
pr_info("Unlock network\n");
cpt_unlock_tcp_connections();
rst_unlock_tcp_connections();
if (root_ns_mask & CLONE_NEWNET)
run_scripts(ACT_NET_UNLOCK);
}
int veth_pair_add(char *in, char *out)
{
char *aux;
struct veth_pair *n;
n = xmalloc(sizeof(*n));
if (n == NULL)
return -1;
n->inside = in;
n->outside = out;
/*
* Does the out string specify a bridge for
* moving the outside end of the veth pair to?
*/
aux = strrchr(out, '@');
if (aux) {
*aux++ = '\0';
n->bridge = aux;
} else {
n->bridge = NULL;
}
list_add(&n->node, &opts.veth_pairs);
if (n->bridge)
pr_debug("Added %s:%s@%s veth map\n", in, out, aux);
else
pr_debug("Added %s:%s veth map\n", in, out);
return 0;
}
/*
* The setns() syscall (called by switch_ns()) can be extremely
* slow. If we call it two or more times from the same task the
* kernel will synchonously go on a very slow routine called
* synchronize_rcu() trying to put a reference on old namespaces.
*
* To avoid doing this more than once we pre-create all the
* needed other-ns sockets in advance.
*/
static int prep_ns_sockets(struct ns_id *ns, bool for_dump)
{
int nsret = -1, ret;
if (ns->pid != getpid()) {
pr_info("Switching to %d's net for collecting sockets\n", ns->pid);
if (switch_ns(ns->pid, &net_ns_desc, &nsret))
return -1;
}
if (for_dump) {
ret = ns->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
if (ret < 0) {
pr_perror("Can't create sock diag socket");
goto err_nl;
}
} else
ns->net.nlsk = -1;
ret = ns->net.seqsk = socket(PF_UNIX, SOCK_SEQPACKET, 0);
if (ret < 0) {
pr_perror("Can't create seqsk for parasite");
goto err_sq;
}
ret = 0;
out:
if (nsret >= 0 && restore_ns(nsret, &net_ns_desc) < 0) {
nsret = -1;
if (ret == 0)
goto err_ret;
}
return ret;
err_ret:
close(ns->net.seqsk);
err_sq:
if (ns->net.nlsk >= 0)
close(ns->net.nlsk);
err_nl:
goto out;
}
static int collect_net_ns(struct ns_id *ns, void *oarg)
{
bool for_dump = (oarg == (void *)1);
int ret;
pr_info("Collecting netns %d/%d\n", ns->id, ns->pid);
ret = prep_ns_sockets(ns, for_dump);
if (ret)
return ret;
if (!for_dump)
return 0;
return collect_sockets(ns);
}
int collect_net_namespaces(bool for_dump)
{
return walk_namespaces(&net_ns_desc, collect_net_ns,
(void *)(for_dump ? 1UL : 0));
}
struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net");
int move_veth_to_bridge(void)
{
int s;
int ret;
struct veth_pair *n;
struct ifreq ifr;
s = -1;
ret = 0;
list_for_each_entry(n, &opts.veth_pairs, node) {
if (n->bridge == NULL)
continue;
pr_debug("\tMoving dev %s to bridge %s\n", n->outside, n->bridge);
if (s == -1) {
s = socket(AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0);
if (s < 0) {
pr_perror("Can't create control socket");
return -1;
}
}
/*
* Add the device to the bridge. This is equivalent to:
* $ brctl addif <bridge> <device>
*/
ifr.ifr_ifindex = if_nametoindex(n->outside);
if (ifr.ifr_ifindex == 0) {
pr_perror("Can't get index of %s", n->outside);
ret = -1;
break;
}
strncpy(ifr.ifr_name, n->bridge, IFNAMSIZ);
ret = ioctl(s, SIOCBRADDIF, &ifr);
if (ret < 0) {
pr_perror("Can't add interface %s to bridge %s",
n->outside, n->bridge);
break;
}
/*
* Make sure the device is up. This is equivalent to:
* $ ip link set dev <device> up
*/
ifr.ifr_ifindex = 0;
strncpy(ifr.ifr_name, n->outside, IFNAMSIZ);
ret = ioctl(s, SIOCGIFFLAGS, &ifr);
if (ret < 0) {
pr_perror("Can't get flags of interface %s", n->outside);
break;
}
if (ifr.ifr_flags & IFF_UP)
continue;
ifr.ifr_flags |= IFF_UP;
ret = ioctl(s, SIOCSIFFLAGS, &ifr);
if (ret < 0) {
pr_perror("Can't set flags of interface %s to 0x%x",
n->outside, ifr.ifr_flags);
break;
}
}
if (s >= 0)
close(s);
return ret;
}