2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00

freeze_processes: fix logic

There are a few issues with the freeze_processes logic:

1. Commit 9fae23fbe2 grossly (by 1000x) miscalculated the number of
   attempts required; as a result, we are seeing something like this:

> (00.000340) freezing processes: 100000 attempts with 100 ms steps
> (00.000351) freezer.state=THAWED
> (00.000358) freezer.state=FREEZING
> (00.100446) freezer.state=FREEZING
> ...close to 100 lines skipped...
> (09.915110) freezer.state=FREEZING
> (10.000432) Error (criu/cr-dump.c:1467): Timeout reached. Try to interrupt: 0
> (10.000563) freezer.state=FREEZING

   For 10s with 100ms steps we only need 100 attempts, not 100000.

2. When the timeout is hit, the "Unable to freeze cgroup" error is not
   printed, and log_unfrozen_stacks() is not called either.

3. The nanosleep at the last iteration is useless (this was hidden by
   issue 1 above, as the timeout was hit first).

Fix all these.

While at it,

4. Amend the error message with the number of attempts, sleep duration,
   and timeout.

5. Modify the "freezing cgroup" debug message to be in sync with the
   above error.

   Was:

   > freezing processes: 100000 attempts with 100 ms steps

   Now:

   > freezing cgroup some/name: 100 x 100ms attempts, timeout: 10s

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Kir Kolyshkin 2024-12-12 17:29:34 -08:00 committed by Andrei Vagin
parent f314ca5e1f
commit 9c3c095cfe

View File

@ -545,7 +545,8 @@ static int freeze_processes(void)
enum freezer_state state = THAWED; enum freezer_state state = THAWED;
static const unsigned long step_ms = 100; static const unsigned long step_ms = 100;
unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; /* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. */
unsigned long nr_attempts = (opts.timeout * 1000) / step_ms;
unsigned long i = 0; unsigned long i = 0;
const struct timespec req = { const struct timespec req = {
@ -554,14 +555,12 @@ static int freeze_processes(void)
}; };
if (unlikely(!nr_attempts)) { if (unlikely(!nr_attempts)) {
/* /* If the timeout is 0, wait for at least 10 seconds. */
* If timeout is turned off, lets nr_attempts = (10 * 1000) / step_ms;
* wait for at least 10 seconds.
*/
nr_attempts = (10 * 1000000) / step_ms;
} }
pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms); pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n",
opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout);
fd = freezer_open(); fd = freezer_open();
if (fd < 0) if (fd < 0)
@ -588,22 +587,22 @@ static int freeze_processes(void)
* not read @tasks pids while freezer in * not read @tasks pids while freezer in
* transition stage. * transition stage.
*/ */
for (; i <= nr_attempts; i++) { while (1) {
state = get_freezer_state(fd); state = get_freezer_state(fd);
if (state == FREEZER_ERROR) { if (state == FREEZER_ERROR) {
close(fd); close(fd);
return -1; return -1;
} }
if (state == FROZEN) if (state == FROZEN || i++ == nr_attempts || alarm_timeouted())
break; break;
if (alarm_timeouted())
goto err;
nanosleep(&req, NULL); nanosleep(&req, NULL);
} }
if (i > nr_attempts) { if (state != FROZEN) {
pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup); pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n",
opts.freeze_cgroup, i, step_ms, opts.timeout);
if (!pr_quelled(LOG_DEBUG)) if (!pr_quelled(LOG_DEBUG))
log_unfrozen_stacks(opts.freeze_cgroup); log_unfrozen_stacks(opts.freeze_cgroup);
goto err; goto err;