mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-22 01:51:51 +00:00
freeze_processes: fix logic
There are a few issues with the freeze_processes logic:

1. Commit 9fae23fbe2 grossly (by 1000x) miscalculated the number of attempts required; as a result, we are seeing something like this:

> (00.000340) freezing processes: 100000 attempts with 100 ms steps
> (00.000351) freezer.state=THAWED
> (00.000358) freezer.state=FREEZING
> (00.100446) freezer.state=FREEZING
> ...close to 100 lines skipped...
> (09.915110) freezer.state=FREEZING
> (10.000432) Error (criu/cr-dump.c:1467): Timeout reached. Try to interrupt: 0
> (10.000563) freezer.state=FREEZING

   For 10s with 100ms steps we only need 100 attempts, not 100000.

2. When the timeout is hit, the "failed to freeze cgroup" error is not printed, and log_unfrozen_stacks() is not called either.

3. The nanosleep at the last iteration is useless (this was hidden by issue 1 above, as the timeout was hit first).

Fix all these.

While at it,

4. Amend the error message with the number of attempts, sleep duration, and timeout.

5. Modify the "freezing cgroup" debug message to be in sync with the above error.

   Was:

> freezing processes: 100000 attempts with 100 ms steps

   Now:

> freezing cgroup some/name: 100 x 100ms attempts, timeout: 10s

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
parent
f314ca5e1f
commit
9c3c095cfe
25
criu/seize.c
25
criu/seize.c
@ -545,7 +545,8 @@ static int freeze_processes(void)
|
||||
enum freezer_state state = THAWED;
|
||||
|
||||
static const unsigned long step_ms = 100;
|
||||
unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms;
|
||||
/* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. */
|
||||
unsigned long nr_attempts = (opts.timeout * 1000) / step_ms;
|
||||
unsigned long i = 0;
|
||||
|
||||
const struct timespec req = {
|
||||
@ -554,14 +555,12 @@ static int freeze_processes(void)
|
||||
};
|
||||
|
||||
if (unlikely(!nr_attempts)) {
|
||||
/*
|
||||
* If timeout is turned off, lets
|
||||
* wait for at least 10 seconds.
|
||||
*/
|
||||
nr_attempts = (10 * 1000000) / step_ms;
|
||||
/* If the timeout is 0, wait for at least 10 seconds. */
|
||||
nr_attempts = (10 * 1000) / step_ms;
|
||||
}
|
||||
|
||||
pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms);
|
||||
pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n",
|
||||
opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout);
|
||||
|
||||
fd = freezer_open();
|
||||
if (fd < 0)
|
||||
@ -588,22 +587,22 @@ static int freeze_processes(void)
|
||||
* not read @tasks pids while freezer in
|
||||
* transition stage.
|
||||
*/
|
||||
for (; i <= nr_attempts; i++) {
|
||||
while (1) {
|
||||
state = get_freezer_state(fd);
|
||||
if (state == FREEZER_ERROR) {
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (state == FROZEN)
|
||||
if (state == FROZEN || i++ == nr_attempts || alarm_timeouted())
|
||||
break;
|
||||
if (alarm_timeouted())
|
||||
goto err;
|
||||
|
||||
nanosleep(&req, NULL);
|
||||
}
|
||||
|
||||
if (i > nr_attempts) {
|
||||
pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
|
||||
if (state != FROZEN) {
|
||||
pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n",
|
||||
opts.freeze_cgroup, i, step_ms, opts.timeout);
|
||||
if (!pr_quelled(LOG_DEBUG))
|
||||
log_unfrozen_stacks(opts.freeze_cgroup);
|
||||
goto err;
|
||||
|
Loading…
x
Reference in New Issue
Block a user