mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-22 01:51:51 +00:00
images/inventory: add field for enabled plugins
This patch extends the inventory image with a `plugins` field that contains an array of the plugins which were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This makes it possible to disable unnecessary plugins during restore, to show appropriate error messages if required CRIU plugins are missing, and to migrate a process that does not use a GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between a *missing* field and an *empty* one: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be *enabled* during restore. - When the field is empty, it indicates that no plugins were used during checkpointing. Thus, all plugins can be *disabled* during restore. Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
parent
87b5ac9d9f
commit
adf2c5be96
@ -2354,12 +2354,12 @@ int cr_restore_tasks(void)
|
||||
if (init_service_fd())
|
||||
return 1;
|
||||
|
||||
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
|
||||
return -1;
|
||||
|
||||
if (check_img_inventory(/* restore = */ true) < 0)
|
||||
goto err;
|
||||
|
||||
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
|
||||
return -1;
|
||||
|
||||
if (init_stats(RESTORE_STATS))
|
||||
goto err;
|
||||
|
||||
|
124
criu/image.c
124
criu/image.c
@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids;
|
||||
u32 root_cg_set;
|
||||
Lsmtype image_lsm;
|
||||
|
||||
/* One plugin name tracked for the inventory image. */
struct inventory_plugin {
|
||||
struct list_head node; /* link in inventory_plugins_list */
|
||||
char *name; /* copy of the plugin name made with xstrdup(); released with xfree() */
|
||||
};
|
||||
|
||||
/* Every plugin name registered through add_inventory_plugin(). */
struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list);
|
||||
/*
 * Number of entries in inventory_plugins_list. check_img_inventory() sets
 * it to -1 when the inventory image has no 'plugins_entry' field, which
 * check_and_remove_inventory_plugin() treats as "every plugin is enabled".
 */
static int n_inventory_plugins;
|
||||
|
||||
int check_img_inventory(bool restore)
|
||||
{
|
||||
int ret = -1;
|
||||
@ -99,6 +107,19 @@ int check_img_inventory(bool restore)
|
||||
} else {
|
||||
opts.network_lock_method = he->network_lock_method;
|
||||
}
|
||||
|
||||
if (!he->plugins_entry) {
|
||||
/* backwards compatibility: if the 'plugins_entry' field is missing,
|
||||
* all plugins should be enabled during restore.
|
||||
*/
|
||||
n_inventory_plugins = -1;
|
||||
} else {
|
||||
PluginsEntry *pe = he->plugins_entry;
|
||||
for (int i = 0; i < pe->n_plugins; i++) {
|
||||
if (add_inventory_plugin(pe->plugins[i]))
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
@ -110,8 +131,92 @@ out_close:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the 'plugins' field in the inventory image contains
|
||||
* the specified plugin name. If found, the plugin is removed
|
||||
* from the linked list.
|
||||
*/
|
||||
bool check_and_remove_inventory_plugin(const char *name, size_t n)
|
||||
{
|
||||
if (n_inventory_plugins == -1)
|
||||
return true; /* backwards compatibility */
|
||||
|
||||
if (n_inventory_plugins > 0) {
|
||||
struct inventory_plugin *p, *tmp;
|
||||
|
||||
list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) {
|
||||
if (!strncmp(name, p->name, n)) {
|
||||
xfree(p->name);
|
||||
list_del(&p->node);
|
||||
xfree(p);
|
||||
n_inventory_plugins--;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* We expect during restore all loaded plugins to be removed from
|
||||
* the inventory_plugins_list. If the list is not empty, show an
|
||||
* error message for each missing plugin.
|
||||
*/
|
||||
int check_inventory_plugins(void)
|
||||
{
|
||||
struct inventory_plugin *p;
|
||||
|
||||
if (n_inventory_plugins <= 0)
|
||||
return 0;
|
||||
|
||||
list_for_each_entry(p, &inventory_plugins_list, node) {
|
||||
pr_err("Missing required plugin: %s\n", p->name);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add plugin name to the inventory image. These values
|
||||
* can be used to identify required plugins during restore.
|
||||
*/
|
||||
int add_inventory_plugin(const char *name)
|
||||
{
|
||||
struct inventory_plugin *p;
|
||||
|
||||
p = xmalloc(sizeof(struct inventory_plugin));
|
||||
if (p == NULL)
|
||||
return -1;
|
||||
|
||||
p->name = xstrdup(name);
|
||||
if (!p->name) {
|
||||
xfree(p);
|
||||
return -1;
|
||||
}
|
||||
list_add(&p->node, &inventory_plugins_list);
|
||||
n_inventory_plugins++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void free_inventory_plugins_list(void)
|
||||
{
|
||||
struct inventory_plugin *p, *tmp;
|
||||
|
||||
if (!list_empty(&inventory_plugins_list)) {
|
||||
list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) {
|
||||
xfree(p->name);
|
||||
list_del(&p->node);
|
||||
xfree(p);
|
||||
}
|
||||
}
|
||||
n_inventory_plugins = 0;
|
||||
}
|
||||
|
||||
int write_img_inventory(InventoryEntry *he)
|
||||
{
|
||||
PluginsEntry pe = PLUGINS_ENTRY__INIT;
|
||||
struct cr_img *img;
|
||||
int ret;
|
||||
|
||||
@ -121,8 +226,27 @@ int write_img_inventory(InventoryEntry *he)
|
||||
if (!img)
|
||||
return -1;
|
||||
|
||||
if (!list_empty(&inventory_plugins_list)) {
|
||||
struct inventory_plugin *p;
|
||||
int i = 0;
|
||||
|
||||
pe.n_plugins = n_inventory_plugins;
|
||||
pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *));
|
||||
if (!pe.plugins)
|
||||
return -1;
|
||||
|
||||
list_for_each_entry(p, &inventory_plugins_list, node) {
|
||||
pe.plugins[i] = p->name;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
he->plugins_entry = &pe;
|
||||
|
||||
ret = pb_write_one(img, he, PB_INVENTORY);
|
||||
|
||||
free_inventory_plugins_list();
|
||||
xfree(pe.plugins);
|
||||
|
||||
xfree(he->root_ids);
|
||||
close_image(img);
|
||||
if (ret < 0)
|
||||
|
@ -177,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size);
|
||||
|
||||
extern void close_image(struct cr_img *);
|
||||
|
||||
extern int add_inventory_plugin(const char *name);
|
||||
extern int check_inventory_plugins(void);
|
||||
extern bool check_and_remove_inventory_plugin(const char *name, size_t n);
|
||||
|
||||
#endif /* __CR_IMAGE_H__ */
|
||||
|
@ -256,6 +256,9 @@ int cr_plugin_init(int stage)
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins())
|
||||
goto err;
|
||||
|
||||
exit_code = 0;
|
||||
err:
|
||||
closedir(d);
|
||||
|
@ -10,6 +10,13 @@ enum lsmtype {
|
||||
APPARMOR = 2;
|
||||
}
|
||||
|
||||
// It is not possible to distinguish between an empty repeated field
|
||||
// and an unset repeated field. To solve this problem and provide backwards
|
||||
// compatibility, the list is wrapped in the 'plugins_entry' message.
|
||||
message plugins_entry {
|
||||
// Names of the plugins that were used during checkpoint.
repeated string plugins = 12;
|
||||
};
|
||||
|
||||
message inventory_entry {
|
||||
required uint32 img_version = 1;
|
||||
optional bool fdinfo_per_id = 2;
|
||||
@ -21,4 +28,5 @@ message inventory_entry {
|
||||
optional uint32 pre_dump_mode = 9;
|
||||
optional bool tcp_close = 10;
|
||||
optional uint32 network_lock_method = 11;
|
||||
optional plugins_entry plugins_entry = 12;
|
||||
}
|
||||
|
@ -60,6 +60,10 @@ static LIST_HEAD(update_vma_info_list);
|
||||
|
||||
size_t kfd_max_buffer_size;
|
||||
|
||||
/*
 * Set once add_inventory_plugin() has succeeded for this plugin, so the
 * plugin name is registered in the inventory only once.
 */
bool plugin_added_to_inventory = false;
|
||||
|
||||
/*
 * Set by amdgpu_plugin_init() during restore when the inventory image does
 * not list this plugin; the other hooks check it and return early.
 */
bool plugin_disabled = false;
|
||||
|
||||
/**************************************************************************************************/
|
||||
|
||||
/* Call ioctl, restarting if it is interrupted */
|
||||
@ -332,6 +336,13 @@ void getenv_size_t(const char *var, size_t *value)
|
||||
|
||||
int amdgpu_plugin_init(int stage)
|
||||
{
|
||||
if (stage == CR_PLUGIN_STAGE__RESTORE) {
|
||||
if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) {
|
||||
plugin_disabled = true;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
|
||||
|
||||
topology_init(&src_topology);
|
||||
@ -365,6 +376,9 @@ int amdgpu_plugin_init(int stage)
|
||||
|
||||
void amdgpu_plugin_fini(int stage, int ret)
|
||||
{
|
||||
if (plugin_disabled)
|
||||
return;
|
||||
|
||||
pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
|
||||
|
||||
if (stage == CR_PLUGIN_STAGE__RESTORE)
|
||||
@ -414,6 +428,14 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
|
||||
if (ret)
|
||||
pr_perror("%s(), Can't handle VMAs of input device", __func__);
|
||||
|
||||
if (!ret && !plugin_added_to_inventory) {
|
||||
ret = add_inventory_plugin(CR_PLUGIN_DESC.name);
|
||||
if (ret)
|
||||
pr_err("Failed to add AMDGPU plugin to inventory image\n");
|
||||
else
|
||||
plugin_added_to_inventory = true;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
|
||||
@ -1540,6 +1562,9 @@ int amdgpu_plugin_restore_file(int id)
|
||||
size_t img_size;
|
||||
FILE *img_fp = NULL;
|
||||
|
||||
if (plugin_disabled)
|
||||
return -ENOTSUP;
|
||||
|
||||
pr_info("Initialized kfd plugin restorer with ID = %d\n", id);
|
||||
|
||||
snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
|
||||
@ -1746,6 +1771,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const
|
||||
char *p_end;
|
||||
bool is_kfd = false, is_renderD = false;
|
||||
|
||||
if (plugin_disabled)
|
||||
return -ENOTSUP;
|
||||
|
||||
plugin_log_msg("Enter %s\n", __func__);
|
||||
|
||||
strncpy(path, in_path, sizeof(path));
|
||||
@ -1805,6 +1833,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
|
||||
struct kfd_ioctl_criu_args args = { 0 };
|
||||
int fd, exit_code = 0;
|
||||
|
||||
if (plugin_disabled)
|
||||
return -ENOTSUP;
|
||||
|
||||
pr_info("Inside %s for target pid = %d\n", __func__, target_pid);
|
||||
|
||||
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
|
||||
|
@ -38,6 +38,8 @@
|
||||
*/
|
||||
bool plugin_disabled = false;
|
||||
|
||||
/*
 * Set once add_inventory_plugin() has succeeded for this plugin, so the
 * plugin name is registered in the inventory only once.
 */
bool plugin_added_to_inventory = false;
|
||||
|
||||
struct pid_info {
|
||||
int pid;
|
||||
char checkpointed;
|
||||
@ -319,7 +321,7 @@ int cuda_plugin_checkpoint_devices(int pid)
|
||||
k_rtsigset_t save_sigset;
|
||||
|
||||
if (plugin_disabled) {
|
||||
return 0;
|
||||
return -ENOTSUP;
|
||||
}
|
||||
|
||||
restore_tid = get_cuda_restore_tid(pid);
|
||||
@ -354,6 +356,15 @@ int cuda_plugin_checkpoint_devices(int pid)
|
||||
pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
|
||||
}
|
||||
}
|
||||
|
||||
if (!status && !plugin_added_to_inventory) {
|
||||
status = add_inventory_plugin(CR_PLUGIN_DESC.name);
|
||||
if (status)
|
||||
pr_err("Failed to add CUDA plugin to inventory image\n");
|
||||
else
|
||||
plugin_added_to_inventory = true;
|
||||
}
|
||||
|
||||
interrupt:
|
||||
int_ret = interrupt_restore_thread(restore_tid, &save_sigset);
|
||||
|
||||
@ -367,7 +378,7 @@ int cuda_plugin_pause_devices(int pid)
|
||||
char msg_buf[CUDA_CKPT_BUF_SIZE];
|
||||
|
||||
if (plugin_disabled) {
|
||||
return 0;
|
||||
return -ENOTSUP;
|
||||
}
|
||||
|
||||
restore_tid = get_cuda_restore_tid(pid);
|
||||
@ -463,6 +474,13 @@ int cuda_plugin_init(int stage)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (stage == CR_PLUGIN_STAGE__RESTORE) {
|
||||
if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) {
|
||||
plugin_disabled = true;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
|
||||
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
|
||||
plugin_disabled = true;
|
||||
|
Loading…
x
Reference in New Issue
Block a user