docker-test: handle race condition error

There is a race condition in docker/containerd that causes docker to occasionally fail when starting a container from a checkpoint immediately after the checkpoint has been created. This problem is unrelated to criu and has been reported in https://github.com/moby/moby/issues/42900 Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
2025-08-30 22:05:36 +00:00 · 2022-08-11 09:51:34 +01:00
parent 49319cd579
commit 2642b657da
1 changed files with 25 additions and 5 deletions
--- a/scripts/ci/docker-test.sh
+++ b/scripts/ci/docker-test.sh
@@ -75,17 +75,37 @@ checkpoint_container () {
 	docker wait cr
 }

-restore_container () {
-	CHECKPOINT_NAME=$1
-
-	docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || {
+print_logs () {
 	cat "$(grep log 'log file:' | sed 's/log file:\s*//')" || true
 		docker logs cr || true
 		cat $CRIU_LOG || true
 		dmesg
 		docker ps
 		exit 1
-	}
+}
+
+declare -i max_restore_container_tries=3
+current_iteration=
+
+restore_container () {
+	CHECKPOINT_NAME=$1
+
+	docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || {
+		# FIXME: There is a race condition in docker/containerd that causes
+		# docker to occasionally fail when starting a container from a
+		# checkpoint immediately after the checkpoint has been created.
+		# https://github.com/moby/moby/issues/42900
+		if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then
+			print_logs
+		fi
+		grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && {
+			((current_iteration+=1))
+			echo "Retry container restore: $current_iteration"
+			sleep 1;
+			restore_container "$CHECKPOINT_NAME"
+		} ||
+		print_logs
+	} && current_iteration=0
 }

 # Scenario: Create multiple containers and checkpoint and restore them once