mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-30 22:05:36 +00:00
freeze_processes: fix logic
There are a few issues with the freeze_processes logic:
1. Commit 9fae23fbe2 grossly (by 1000x) miscalculated the number of
attempts required. As a result, we are seeing something like this:
> (00.000340) freezing processes: 100000 attempts with 100 ms steps
> (00.000351) freezer.state=THAWED
> (00.000358) freezer.state=FREEZING
> (00.100446) freezer.state=FREEZING
> ...close to 100 lines skipped...
> (09.915110) freezer.state=FREEZING
> (10.000432) Error (criu/cr-dump.c:1467): Timeout reached. Try to interrupt: 0
> (10.000563) freezer.state=FREEZING
For 10s with 100ms steps we only need 100 attempts, not 100000.
2. When the timeout is hit, the "Unable to freeze cgroup" error is not
printed, and log_unfrozen_stacks() is not called either.
3. The nanosleep at the last iteration is useless (this was hidden by
issue 1 above, as the timeout was hit first).
Fix all these.
While at it,
4. Amend the error message with the number of attempts, sleep duration,
and timeout.
5. Modify the "freezing cgroup" debug message to be in sync with the
above error.
Was:
> freezing processes: 100000 attempts with 100 ms steps
Now:
> freezing cgroup some/name: 100 x 100ms attempts, timeout: 10s
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
committed by
Andrei Vagin
parent
f314ca5e1f
commit
9c3c095cfe
25
criu/seize.c
25
criu/seize.c
@@ -545,7 +545,8 @@ static int freeze_processes(void)
|
||||
enum freezer_state state = THAWED;
|
||||
|
||||
static const unsigned long step_ms = 100;
|
||||
unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms;
|
||||
/* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. */
|
||||
unsigned long nr_attempts = (opts.timeout * 1000) / step_ms;
|
||||
unsigned long i = 0;
|
||||
|
||||
const struct timespec req = {
|
||||
@@ -554,14 +555,12 @@ static int freeze_processes(void)
|
||||
};
|
||||
|
||||
if (unlikely(!nr_attempts)) {
|
||||
/*
|
||||
* If timeout is turned off, lets
|
||||
* wait for at least 10 seconds.
|
||||
*/
|
||||
nr_attempts = (10 * 1000000) / step_ms;
|
||||
/* If the timeout is 0, wait for at least 10 seconds. */
|
||||
nr_attempts = (10 * 1000) / step_ms;
|
||||
}
|
||||
|
||||
pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms);
|
||||
pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n",
|
||||
opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout);
|
||||
|
||||
fd = freezer_open();
|
||||
if (fd < 0)
|
||||
@@ -588,22 +587,22 @@ static int freeze_processes(void)
|
||||
* not read @tasks pids while freezer in
|
||||
* transition stage.
|
||||
*/
|
||||
for (; i <= nr_attempts; i++) {
|
||||
while (1) {
|
||||
state = get_freezer_state(fd);
|
||||
if (state == FREEZER_ERROR) {
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (state == FROZEN)
|
||||
if (state == FROZEN || i++ == nr_attempts || alarm_timeouted())
|
||||
break;
|
||||
if (alarm_timeouted())
|
||||
goto err;
|
||||
|
||||
nanosleep(&req, NULL);
|
||||
}
|
||||
|
||||
if (i > nr_attempts) {
|
||||
pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
|
||||
if (state != FROZEN) {
|
||||
pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n",
|
||||
opts.freeze_cgroup, i, step_ms, opts.timeout);
|
||||
if (!pr_quelled(LOG_DEBUG))
|
||||
log_unfrozen_stacks(opts.freeze_cgroup);
|
||||
goto err;
|
||||
|
Reference in New Issue
Block a user