mirror of
https://gitea.com/gitea/act_runner.git
synced 2026-06-11 04:31:31 +00:00
fix: clean up job network and container when container start fails (#986)
The teardown that removes a job's per-job network and container runs as a `Finally` on the step pipeline in `newJobExecutor`, which only executes after a successful start. When the start itself fails (e.g. a `docker cp` error from a buggy daemon), that `Finally` is skipped, so the network and container leak until Docker's address pool is exhausted and later jobs can no longer create networks. This tears them down in `startContainer` when the start returns an error, reusing the existing `cleanUpJobContainer` teardown. Exposed by the daemon regression in https://gitea.com/gitea/runner/issues/981, where every failed `docker cp` leaked a per-job network. --- This PR was written with the help of Claude Opus 4.7 Reviewed-on: https://gitea.com/gitea/runner/pulls/986 Reviewed-by: Nicolas <bircni@icloud.com> Co-authored-by: silverwind <me@silverwind.io> Co-committed-by: silverwind <me@silverwind.io>
This commit is contained in:
@@ -601,10 +601,34 @@ func (rc *RunContext) interpolateOutputs() common.Executor {
|
|||||||
|
|
||||||
func (rc *RunContext) startContainer() common.Executor {
|
func (rc *RunContext) startContainer() common.Executor {
|
||||||
return func(ctx context.Context) error {
|
return func(ctx context.Context) error {
|
||||||
|
var err error
|
||||||
if rc.IsHostEnv(ctx) {
|
if rc.IsHostEnv(ctx) {
|
||||||
return rc.startHostEnvironment()(ctx)
|
err = rc.startHostEnvironment()(ctx)
|
||||||
|
} else {
|
||||||
|
err = rc.startJobContainer()(ctx)
|
||||||
}
|
}
|
||||||
return rc.startJobContainer()(ctx)
|
if err != nil {
|
||||||
|
// The job executor's teardown only runs after a successful start, so a failed
|
||||||
|
// start would otherwise leak the per-job network and container.
|
||||||
|
rc.cleanupFailedStart(ctx)
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rc *RunContext) cleanupFailedStart(ctx context.Context) {
|
||||||
|
if rc.cleanUpJobContainer == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cleanCtx := ctx
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
// the start likely failed because ctx was cancelled, detach so teardown still runs
|
||||||
|
var cancel context.CancelFunc
|
||||||
|
cleanCtx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), time.Minute)
|
||||||
|
defer cancel()
|
||||||
|
}
|
||||||
|
if err := rc.cleanUpJobContainer(cleanCtx); err != nil {
|
||||||
|
common.Logger(ctx).Errorf("Error while cleaning up after failed container start for job %s: %v", rc.JobName, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import (
|
|||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
assert "github.com/stretchr/testify/assert"
|
assert "github.com/stretchr/testify/assert"
|
||||||
|
require "github.com/stretchr/testify/require"
|
||||||
yaml "go.yaml.in/yaml/v4"
|
yaml "go.yaml.in/yaml/v4"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -659,3 +660,53 @@ func TestPrintStartJobContainerGroupGolden(t *testing.T) {
|
|||||||
}, "\n")
|
}, "\n")
|
||||||
assert.Equal(t, want, buf.String())
|
assert.Equal(t, want, buf.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunContext_cleanupFailedStart(t *testing.T) {
|
||||||
|
type ctxKey string
|
||||||
|
const sentinel = ctxKey("sentinel")
|
||||||
|
|
||||||
|
// the fresh context is cancelled via defer on return, so capture state inside the stub
|
||||||
|
type capture struct {
|
||||||
|
calls int
|
||||||
|
err error
|
||||||
|
sentinel any
|
||||||
|
}
|
||||||
|
newRC := func(c *capture) *RunContext {
|
||||||
|
return &RunContext{
|
||||||
|
JobName: "job",
|
||||||
|
cleanUpJobContainer: func(ctx context.Context) error {
|
||||||
|
c.calls++
|
||||||
|
c.err = ctx.Err()
|
||||||
|
c.sentinel = ctx.Value(sentinel)
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("runs teardown on the live context", func(t *testing.T) {
|
||||||
|
var c capture
|
||||||
|
ctx := context.WithValue(context.Background(), sentinel, "v")
|
||||||
|
|
||||||
|
newRC(&c).cleanupFailedStart(ctx)
|
||||||
|
|
||||||
|
assert.Equal(t, 1, c.calls)
|
||||||
|
require.NoError(t, c.err)
|
||||||
|
assert.Equal(t, "v", c.sentinel)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("falls back to a fresh context when the input is done", func(t *testing.T) {
|
||||||
|
var c capture
|
||||||
|
ctx, cancel := context.WithCancel(context.WithValue(context.Background(), sentinel, "v"))
|
||||||
|
cancel()
|
||||||
|
|
||||||
|
newRC(&c).cleanupFailedStart(ctx)
|
||||||
|
|
||||||
|
assert.Equal(t, 1, c.calls)
|
||||||
|
require.NoError(t, c.err)
|
||||||
|
assert.Nil(t, c.sentinel)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("no-op when there is nothing to clean up", func(t *testing.T) {
|
||||||
|
assert.NotPanics(t, func() { (&RunContext{}).cleanupFailedStart(context.Background()) })
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user