Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions docs/source/markdown/podman-wait.1.md.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@ separated by newline in the same order as they were given to the command. An
exit code of -1 is emitted for all conditions other than "stopped" and
"exited".

NOTE: there is an inherent race condition when waiting for containers with a
restart policy of `always` or `on-failure`, such as those created by `podman
kube play`. Such containers may be repeatedly exiting and restarting, possibly
with different exit codes, but `podman wait` can only display and detect one.
When waiting for containers with a restart policy of `always` or `on-failure`,
such as those created by `podman kube play`, the containers may be repeatedly
exiting and restarting, possibly with different exit codes. `podman wait` will
only display and detect the first exit after the wait command was started.

When running a container with `podman run --rm`, `podman wait` should also wait
for the container to be fully removed before it returns.

## OPTIONS

Expand Down
160 changes: 69 additions & 91 deletions libpod/container_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -541,119 +541,97 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
}
return -1, define.ErrCtrRemoved
}
var conmonTimer time.Timer
conmonTimerSet := false

conmonPidFd := c.getConmonPidFd()
if conmonPidFd != -1 {
defer unix.Close(conmonPidFd)
}
conmonPidFdTriggered := false

getExitCode := func() (bool, int32, error) {
containerRemoved := false
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
}

if err := c.syncContainer(); err != nil {
if !errors.Is(err, define.ErrNoSuchCtr) {
return false, -1, err
}
containerRemoved = true
}

// If conmon is not alive anymore set a timer to make sure
// we're returning even if conmon has forcefully been killed.
if !conmonTimerSet && !containerRemoved {
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
switch {
case errors.Is(err, define.ErrNoSuchCtr):
// Container has been removed, so we assume the
// exit code is present in the DB.
containerRemoved = true
case err != nil:
return false, -1, err
case !conmonAlive:
// Give the exit code at most 20 seconds to
// show up in the DB. That should largely be
// enough for the cleanup process.
timerDuration := time.Second * 20
conmonTimer = *time.NewTimer(timerDuration)
conmonTimerSet = true
case conmonAlive:
// Continue waiting if conmon's still running.
return false, -1, nil
}
}
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()

timedout := ""
if !containerRemoved {
// If conmon is dead for more than $timerDuration or if the
// container has exited properly, try to look up the exit code.
select {
case <-conmonTimer.C:
logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
timedout = " [exceeded conmon timeout waiting for container to exit]"
default:
switch c.state.State {
case define.ContainerStateExited, define.ContainerStateConfigured:
// Container exited, so we can look up the exit code.
case define.ContainerStateStopped:
// Continue looping unless the restart policy is always.
// In this case, the container would never transition to
// the exited state, so we need to look up the exit code.
if c.config.RestartPolicy != define.RestartPolicyAlways {
return false, -1, nil
}
default:
// Continue looping
return false, -1, nil
// Note this is one of the rare cases where we do not like to use syncContainer() as it does read the exit file
// We like to avoid that here because that means we might read it before the container cleanup
// ran and possibly removed the container.
if err := c.runtime.state.UpdateContainer(c); err != nil {
if errors.Is(err, define.ErrNoSuchCtr) {
// if the container is not valid at this point as it was deleted,
// check if the exit code was recorded in the db.
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err == nil {
return exitCode, nil
}
}
return -1, err
}
}

conmonPID := c.state.ConmonPID
// conmonPID == 0 means container is not running
if conmonPID == 0 {
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err != nil {
if errors.Is(err, define.ErrNoSuchExitCode) {
// If the container is configured or created, we must assume it never ran.
if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
return true, 0, nil
return 0, nil
}
}
return true, -1, fmt.Errorf("%w (container in state %s)%s", err, c.state.State, timedout)
return -1, err
}
return exitCode, nil
}

return true, exitCode, nil
conmonPidFd := c.getConmonPidFd()
if conmonPidFd > -1 {
defer unix.Close(conmonPidFd)
}

for {
hasExited, exitCode, err := getExitCode()
if hasExited {
return exitCode, err
// we cannot wait locked as we would hold the lock forever, so we unlock and then lock again
c.lock.Unlock()
err := waitForConmonExit(conmonPID, conmonPidFd, pollInterval)
c.lock.Lock()
if err != nil {
return -1, fmt.Errorf("failed to wait for conmon to exit: %w", err)
}

// we locked again so we must sync the state
if err := c.syncContainer(); err != nil {
if errors.Is(err, define.ErrNoSuchCtr) {
// if the container is not valid at this point as it was deleted,
// check if the exit code was recorded in the db.
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err == nil {
return exitCode, nil
}
}
if err != nil {
return -1, err
return -1, err
}

// syncContainer already did the exit file read so nothing extra for us to do here
return c.runtime.state.GetContainerExitCode(id)
}

func waitForConmonExit(conmonPID, conmonPidFd int, pollInterval time.Duration) error {
if conmonPidFd > -1 {
for {
fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
if _, err := unix.Poll(fds, -1); err != nil {
if err == unix.EINTR {
continue
}
return err
}
return nil
}
select {
case <-ctx.Done():
return -1, fmt.Errorf("waiting for exit code of container %s canceled", id)
default:
if conmonPidFd != -1 && !conmonPidFdTriggered {
// If possible (pidfd works), the first cycle we block until conmon dies
// If this happens, and we fall back to the old poll delay
// There is a deadlock in the cleanup code for "play kube" which causes
// conmon to not exit, so unfortunately we have to use the poll interval
// timeout here to avoid hanging.
fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
_, _ = unix.Poll(fds, int(pollInterval.Milliseconds()))
conmonPidFdTriggered = true
} else {
time.Sleep(pollInterval)
}
// no pidfd support, we must poll the pid
for {
if err := unix.Kill(conmonPID, 0); err != nil {
if err == unix.ESRCH {
break
}
return err
}
time.Sleep(pollInterval)
}
return nil
}

type waitResult struct {
Expand Down
5 changes: 5 additions & 0 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -2163,6 +2163,11 @@ func (c *Container) stopPodIfNeeded(ctx context.Context) error {
return nil
}

// Never try to stop the pod when an init container stops
if c.IsInitCtr() {
return nil
}

pod, err := c.runtime.state.Pod(c.config.Pod)
if err != nil {
return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err)
Expand Down
18 changes: 8 additions & 10 deletions libpod/container_internal_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -694,16 +694,14 @@ func (c *Container) makePlatformBindMounts() error {
}

func (c *Container) getConmonPidFd() int {
if c.state.ConmonPID != 0 {
// Track lifetime of conmon precisely using pidfd_open + poll.
// There are many cases for this to fail, for instance conmon is dead
// or pidfd_open is not supported (pre linux 5.3), so fall back to the
// traditional loop with poll + sleep
if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
return fd
} else if err != unix.ENOSYS && err != unix.ESRCH {
logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
}
// Track lifetime of conmon precisely using pidfd_open + poll.
// There are many cases for this to fail, for instance conmon is dead
// or pidfd_open is not supported (pre linux 5.3), so fall back to the
// traditional loop with poll + sleep
if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
return fd
} else if err != unix.ENOSYS && err != unix.ESRCH {
logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
}
return -1
}
Expand Down
7 changes: 0 additions & 7 deletions libpod/runtime_volume.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,6 @@ func (r *Runtime) RemoveVolume(ctx context.Context, v *Volume, force bool, timeo
return define.ErrRuntimeStopped
}

if !v.valid {
if ok, _ := r.state.HasVolume(v.Name()); !ok {
// Volume probably already removed
// Or was never in the runtime to begin with
return nil
}
}
return r.removeVolume(ctx, v, force, timeout, false)
}

Expand Down
21 changes: 14 additions & 7 deletions libpod/runtime_volume_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,13 +357,11 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo
return define.ErrVolumeRemoved
}

v.lock.Lock()
defer v.lock.Unlock()

// Update volume status to pick up a potential removal from state
if err := v.update(); err != nil {
return err
}
// DANGEROUS: Do not lock here yet because we might need to remove containers first.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking we should have a "Removing" bool in volumes that prevents further use once it's set. Would solve the container-addition-during-removal problem. But that seems like a separate PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, and the longer I think about it, the more I think we might have similar issues around networks, secrets, etc.

// In general we must always acquire the ctr lock before a volume lock, so we cannot lock here.
// THIS MUST BE DONE to prevent ABBA deadlocks.
// It also means there are several races around creating containers with volumes and removing
// them in parallel. However, that problem exists regardless of taking the lock here or not.

deps, err := r.state.VolumeInUse(v)
if err != nil {
Expand Down Expand Up @@ -400,6 +398,15 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo
}
}

// Ok now we are good to lock.
v.lock.Lock()
defer v.lock.Unlock()

// Update volume status to pick up a potential removal from state
if err := v.update(); err != nil {
return err
}

// If the volume is still mounted - force unmount it
if err := v.unmount(true); err != nil {
if force {
Expand Down
12 changes: 7 additions & 5 deletions test/system/030-run.bats
Original file line number Diff line number Diff line change
Expand Up @@ -1289,13 +1289,15 @@ EOF
rm -rf $romount
}

@test "podman run --restart=always -- wait" {
@test "podman run --restart=always/on-failure -- wait" {
# regression test for #18572 to make sure Podman waits less than 20 seconds
ctr=c_$(safename)
run_podman run -d --restart=always --name=$ctr $IMAGE false
PODMAN_TIMEOUT=20 run_podman wait $ctr
is "$output" "1" "container should exit 1"
run_podman rm -f -t0 $ctr
for policy in always on-failure; do
run_podman run -d --restart=$policy --name=$ctr $IMAGE false
PODMAN_TIMEOUT=20 run_podman wait $ctr
is "$output" "1" "container should exit 1 (policy: $policy)"
run_podman rm -f -t0 $ctr
done
}

@test "podman run - custom static_dir" {
Expand Down
37 changes: 28 additions & 9 deletions test/system/500-networking.bats
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,27 @@ EOF
fi
}

# wait_for_restart_count CNAME COUNT TNAME
#
# Poll 'podman inspect' until container CNAME reports RestartCount == COUNT.
# Gives up (die) after 10 polls at 0.5s intervals (~5 seconds total).
# TNAME is used only in the failure message to identify the test variant.
function wait_for_restart_count() {
    local cname="$1"
    local count="$2"
    local tname="$3"

    local timeout=10
    while :; do
        run_podman inspect --format "{{.RestartCount}}" $cname
        # Compare against the named local, not the raw positional "$2",
        # for consistency with cname/tname above.
        if [[ "$output" == "$count" ]]; then
            break
        fi

        timeout=$((timeout - 1))
        if [[ $timeout -eq 0 ]]; then
            die "Timed out waiting for RestartCount with $tname"
        fi
        sleep 0.5
    done
}

# Test for https://github.com/containers/podman/issues/18615
@test "podman network cleanup --userns + --restart" {
skip_if_cgroupsv1 "run --uidmap fails on cgroups v1 (issue 15025, wontfix)"
Expand All @@ -955,10 +976,8 @@ EOF
# This will cause 7 containers runs with the restart policy (one more than the on failure limit)
# Since they run sequentially they should run fine without allocating all ips.
run_podman 1 run --name $cname --network $net1 --restart on-failure:6 --userns keep-id $IMAGE false

# Previously this would fail as the container would run out of ips after 5 restarts.
run_podman inspect --format "{{.RestartCount}}" $cname
assert "$output" == "6" "RestartCount for failing container with bridge network"
wait_for_restart_count $cname 6 "custom network"
run_podman wait $cname

# Now make sure we can still run a container with free ips.
run_podman run --rm --network $net1 $IMAGE true
Expand All @@ -969,23 +988,23 @@ EOF
if has_slirp4netns; then
cname2=con2-$(random_string 10)
run_podman 1 run --name $cname2 --network slirp4netns --restart on-failure:2 --userns keep-id $IMAGE false
run_podman inspect --format "{{.RestartCount}}" $cname2
assert "$output" == "2" "RestartCount for failing container with slirp4netns"
wait_for_restart_count $cname2 2 "slirp4netns"
run_podman wait $cname2
fi

if is_rootless; then
# pasta can only run rootless
cname3=con3-$(random_string 10)
run_podman 1 run --name $cname3 --network pasta --restart on-failure:2 --userns keep-id $IMAGE false
run_podman inspect --format "{{.RestartCount}}" $cname3
assert "$output" == "2" "RestartCount for failing container with pasta"
wait_for_restart_count $cname3 2 "pasta"
run_podman wait $cname3
else
# This is racy if other programs modify /run/netns while the test is running.
# However I think the risk is minimal and I think checking for this is important.
assert "$(ls /run/netns | wc -l)" == "$netns_count" "/run/netns has no leaked netns files"
fi

run_podman rm -f -t0 $cname $cname2 $cname3
run_podman rm $cname $cname2 $cname3
run_podman network rm $net1
}

Expand Down