Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions docs/source/markdown/podman-wait.1.md.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@ separated by newline in the same order as they were given to the command. An
exit code of -1 is emitted for all conditions other than "stopped" and
"exited".

NOTE: there is an inherent race condition when waiting for containers with a
restart policy of `always` or `on-failure`, such as those created by `podman
kube play`. Such containers may be repeatedly exiting and restarting, possibly
with different exit codes, but `podman wait` can only display and detect one.
When waiting for containers with a restart policy of `always` or `on-failure`,
such as those created by `podman kube play`, the containers may be repeatedly
exiting and restarting, possibly with different exit codes. `podman wait` will
only display and detect the first exit after the wait command was started.

When running a container with `podman run --rm`, `podman wait` should also wait
for the container to be fully removed before it returns.

## OPTIONS

Expand Down
160 changes: 69 additions & 91 deletions libpod/container_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -541,119 +541,97 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
}
return -1, define.ErrCtrRemoved
}
var conmonTimer time.Timer
conmonTimerSet := false

conmonPidFd := c.getConmonPidFd()
if conmonPidFd != -1 {
defer unix.Close(conmonPidFd)
}
conmonPidFdTriggered := false

getExitCode := func() (bool, int32, error) {
containerRemoved := false
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
}

if err := c.syncContainer(); err != nil {
if !errors.Is(err, define.ErrNoSuchCtr) {
return false, -1, err
}
containerRemoved = true
}

// If conmon is not alive anymore set a timer to make sure
// we're returning even if conmon has forcefully been killed.
if !conmonTimerSet && !containerRemoved {
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
switch {
case errors.Is(err, define.ErrNoSuchCtr):
// Container has been removed, so we assume the
// exit code is present in the DB.
containerRemoved = true
case err != nil:
return false, -1, err
case !conmonAlive:
// Give the exit code at most 20 seconds to
// show up in the DB. That should largely be
// enough for the cleanup process.
timerDuration := time.Second * 20
conmonTimer = *time.NewTimer(timerDuration)
conmonTimerSet = true
case conmonAlive:
// Continue waiting if conmon's still running.
return false, -1, nil
}
}
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()

timedout := ""
if !containerRemoved {
// If conmon is dead for more than $timerDuration or if the
// container has exited properly, try to look up the exit code.
select {
case <-conmonTimer.C:
logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
timedout = " [exceeded conmon timeout waiting for container to exit]"
default:
switch c.state.State {
case define.ContainerStateExited, define.ContainerStateConfigured:
// Container exited, so we can look up the exit code.
case define.ContainerStateStopped:
// Continue looping unless the restart policy is always.
// In this case, the container would never transition to
// the exited state, so we need to look up the exit code.
if c.config.RestartPolicy != define.RestartPolicyAlways {
return false, -1, nil
}
default:
// Continue looping
return false, -1, nil
// Note this is one of the rare cases where we do not like to use syncContainer() as it does read the exit file
// We like to avoid that here because that means we might read it before the container cleanup
// ran and possibly removed the container.
if err := c.runtime.state.UpdateContainer(c); err != nil {
if errors.Is(err, define.ErrNoSuchCtr) {
// if the container is not valid at this point as it was deleted,
// check if the exit code was recorded in the db.
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err == nil {
return exitCode, nil
}
}
return -1, err
}
}

conmonPID := c.state.ConmonPID
// conmonPID == 0 means container is not running
if conmonPID == 0 {
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err != nil {
if errors.Is(err, define.ErrNoSuchExitCode) {
// If the container is configured or created, we must assume it never ran.
if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
return true, 0, nil
return 0, nil
}
}
return true, -1, fmt.Errorf("%w (container in state %s)%s", err, c.state.State, timedout)
return -1, err
}
return exitCode, nil
}

return true, exitCode, nil
conmonPidFd := c.getConmonPidFd()
if conmonPidFd > -1 {
defer unix.Close(conmonPidFd)
}

for {
hasExited, exitCode, err := getExitCode()
if hasExited {
return exitCode, err
// we cannot wait locked as we would hold the lock forever, so we unlock and then lock again
c.lock.Unlock()
err := waitForConmonExit(conmonPID, conmonPidFd, pollInterval)
c.lock.Lock()
if err != nil {
return -1, fmt.Errorf("failed to wait for conmon to exit: %w", err)
}

// we locked again so we must sync the state
if err := c.syncContainer(); err != nil {
if errors.Is(err, define.ErrNoSuchCtr) {
// if the container is not valid at this point as it was deleted,
// check if the exit code was recorded in the db.
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err == nil {
return exitCode, nil
}
}
if err != nil {
return -1, err
return -1, err
}

// syncContainer already did the exit file read so nothing extra for us to do here
return c.runtime.state.GetContainerExitCode(id)
}

func waitForConmonExit(conmonPID, conmonPidFd int, pollInterval time.Duration) error {
if conmonPidFd > -1 {
for {
fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
if _, err := unix.Poll(fds, -1); err != nil {
if err == unix.EINTR {
continue
}
return err
}
return nil
}
select {
case <-ctx.Done():
return -1, fmt.Errorf("waiting for exit code of container %s canceled", id)
default:
if conmonPidFd != -1 && !conmonPidFdTriggered {
// If possible (pidfd works), the first cycle we block until conmon dies
// If this happens, and we fall back to the old poll delay
// There is a deadlock in the cleanup code for "play kube" which causes
// conmon to not exit, so unfortunately we have to use the poll interval
// timeout here to avoid hanging.
fds := []unix.PollFd{{Fd: int32(conmonPidFd), Events: unix.POLLIN}}
_, _ = unix.Poll(fds, int(pollInterval.Milliseconds()))
conmonPidFdTriggered = true
} else {
time.Sleep(pollInterval)
}
// no pidfd support, we must poll the pid
for {
if err := unix.Kill(conmonPID, 0); err != nil {
if err == unix.ESRCH {
break
}
return err
}
time.Sleep(pollInterval)
}
return nil
}

type waitResult struct {
Expand Down
5 changes: 5 additions & 0 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -2163,6 +2163,11 @@ func (c *Container) stopPodIfNeeded(ctx context.Context) error {
return nil
}

// Never try to stop the pod when an init container stops
if c.IsInitCtr() {
return nil
}

pod, err := c.runtime.state.Pod(c.config.Pod)
if err != nil {
return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err)
Expand Down
18 changes: 8 additions & 10 deletions libpod/container_internal_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -694,16 +694,14 @@ func (c *Container) makePlatformBindMounts() error {
}

func (c *Container) getConmonPidFd() int {
if c.state.ConmonPID != 0 {
// Track lifetime of conmon precisely using pidfd_open + poll.
// There are many cases for this to fail, for instance conmon is dead
// or pidfd_open is not supported (pre linux 5.3), so fall back to the
// traditional loop with poll + sleep
if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
return fd
} else if err != unix.ENOSYS && err != unix.ESRCH {
logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
}
// Track lifetime of conmon precisely using pidfd_open + poll.
// There are many cases for this to fail, for instance conmon is dead
// or pidfd_open is not supported (pre linux 5.3), so fall back to the
// traditional loop with poll + sleep
if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
return fd
} else if err != unix.ENOSYS && err != unix.ESRCH {
logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
}
return -1
}
Expand Down
7 changes: 0 additions & 7 deletions libpod/runtime_volume.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,6 @@ func (r *Runtime) RemoveVolume(ctx context.Context, v *Volume, force bool, timeo
return define.ErrRuntimeStopped
}

if !v.valid {
if ok, _ := r.state.HasVolume(v.Name()); !ok {
// Volume probably already removed
// Or was never in the runtime to begin with
return nil
}
}
return r.removeVolume(ctx, v, force, timeout, false)
}

Expand Down
21 changes: 14 additions & 7 deletions libpod/runtime_volume_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,13 +357,11 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo
return define.ErrVolumeRemoved
}

v.lock.Lock()
defer v.lock.Unlock()

// Update volume status to pick up a potential removal from state
if err := v.update(); err != nil {
return err
}
// DANGEROUS: Do not lock here yet because we might need to remove containers first.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking we should have a "Removing" bool in volumes that prevents further use once it's set. Would solve the container-addition-during-removal problem. But that seems like a separate PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, and the longer I think about it, the more I think we might have similar issues around networks, secrets, etc.

// In general we must always acquire the ctr lock before a volume lock, so we cannot lock here.
// THIS MUST BE DONE to prevent ABBA deadlocks.
// It also means there are several races around creating containers with volumes and removing
// them in parallel. However, that problem exists regardless of taking the lock here or not.

deps, err := r.state.VolumeInUse(v)
if err != nil {
Expand Down Expand Up @@ -400,6 +398,15 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo
}
}

// Ok now we are good to lock.
v.lock.Lock()
defer v.lock.Unlock()

// Update volume status to pick up a potential removal from state
if err := v.update(); err != nil {
return err
}

// If the volume is still mounted - force unmount it
if err := v.unmount(true); err != nil {
if force {
Expand Down
12 changes: 7 additions & 5 deletions test/system/030-run.bats
Original file line number Diff line number Diff line change
Expand Up @@ -1289,13 +1289,15 @@ EOF
rm -rf $romount
}

@test "podman run --restart=always -- wait" {
@test "podman run --restart=always/on-failure -- wait" {
# regression test for #18572 to make sure Podman waits less than 20 seconds
ctr=c_$(safename)
run_podman run -d --restart=always --name=$ctr $IMAGE false
PODMAN_TIMEOUT=20 run_podman wait $ctr
is "$output" "1" "container should exit 1"
run_podman rm -f -t0 $ctr
for policy in always on-failure; do
run_podman run -d --restart=$policy --name=$ctr $IMAGE false
PODMAN_TIMEOUT=20 run_podman wait $ctr
is "$output" "1" "container should exit 1 (policy: $policy)"
run_podman rm -f -t0 $ctr
done
}

@test "podman run - custom static_dir" {
Expand Down
37 changes: 28 additions & 9 deletions test/system/500-networking.bats
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,27 @@ EOF
fi
}

# wait_for_restart_count CNAME COUNT TNAME
#
# Poll 'podman inspect' until container CNAME reports RestartCount == COUNT.
# Gives up (die) after 10 polls at 0.5s intervals (~5 seconds total).
# TNAME is used only in the failure message to identify the test variant.
function wait_for_restart_count() {
    local cname="$1"
    local count="$2"
    local tname="$3"

    local timeout=10
    while :; do
        run_podman inspect --format "{{.RestartCount}}" $cname
        # Compare against the named local, not the raw positional "$2",
        # for consistency with cname/tname above.
        if [[ "$output" == "$count" ]]; then
            break
        fi

        timeout=$((timeout - 1))
        if [[ $timeout -eq 0 ]]; then
            die "Timed out waiting for RestartCount with $tname"
        fi
        sleep 0.5
    done
}

# Test for https://github.com/containers/podman/issues/18615
@test "podman network cleanup --userns + --restart" {
skip_if_cgroupsv1 "run --uidmap fails on cgroups v1 (issue 15025, wontfix)"
Expand All @@ -955,10 +976,8 @@ EOF
# This will cause 7 containers runs with the restart policy (one more than the on failure limit)
# Since they run sequentially they should run fine without allocating all ips.
run_podman 1 run --name $cname --network $net1 --restart on-failure:6 --userns keep-id $IMAGE false

# Previously this would fail as the container would run out of ips after 5 restarts.
run_podman inspect --format "{{.RestartCount}}" $cname
assert "$output" == "6" "RestartCount for failing container with bridge network"
wait_for_restart_count $cname 6 "custom network"
run_podman wait $cname

# Now make sure we can still run a container with free ips.
run_podman run --rm --network $net1 $IMAGE true
Expand All @@ -969,23 +988,23 @@ EOF
if has_slirp4netns; then
cname2=con2-$(random_string 10)
run_podman 1 run --name $cname2 --network slirp4netns --restart on-failure:2 --userns keep-id $IMAGE false
run_podman inspect --format "{{.RestartCount}}" $cname2
assert "$output" == "2" "RestartCount for failing container with slirp4netns"
wait_for_restart_count $cname2 2 "slirp4netns"
run_podman wait $cname2
fi

if is_rootless; then
# pasta can only run rootless
cname3=con3-$(random_string 10)
run_podman 1 run --name $cname3 --network pasta --restart on-failure:2 --userns keep-id $IMAGE false
run_podman inspect --format "{{.RestartCount}}" $cname3
assert "$output" == "2" "RestartCount for failing container with pasta"
wait_for_restart_count $cname3 2 "pasta"
run_podman wait $cname3
else
# This is racy if other programs modify /run/netns while the test is running.
# However I think the risk is minimal and I think checking for this is important.
assert "$(ls /run/netns | wc -l)" == "$netns_count" "/run/netns has no leaked netns files"
fi

run_podman rm -f -t0 $cname $cname2 $cname3
run_podman rm $cname $cname2 $cname3
run_podman network rm $net1
}

Expand Down