diff --git a/act/container/host_environment.go b/act/container/host_environment.go index 023e3fdc..23385e7a 100644 --- a/act/container/host_environment.go +++ b/act/container/host_environment.go @@ -429,6 +429,24 @@ func (e *HostEnvironment) UpdateFromEnv(srcPath string, env *map[string]string) return parseEnvFile(e, srcPath, env) } +// removeAll is the filesystem delete used by removeAllWithContext. A package +// var so tests can substitute a blocking stub without patching os.RemoveAll. +var removeAll = os.RemoveAll + +// removeAllWithContext runs removeAll in a goroutine and returns once it +// finishes or ctx is cancelled. On cancellation the goroutine is left running — +// a delete blocked inside a syscall cannot be interrupted (see runWithTimeout). +func removeAllWithContext(ctx context.Context, path string) error { + done := make(chan error, 1) + go func() { done <- removeAll(path) }() + select { + case err := <-done: + return err + case <-ctx.Done(): + return ctx.Err() + } +} + func removePathWithRetry(ctx context.Context, path string) error { if path == "" { return nil @@ -448,10 +466,13 @@ func removePathWithRetry(ctx context.Context, path string) error { case <-time.After(delay): } } - lastErr = os.RemoveAll(path) + lastErr = removeAllWithContext(ctx, path) if lastErr == nil { return nil } + if errors.Is(lastErr, context.DeadlineExceeded) { + return lastErr + } } return lastErr } @@ -533,23 +554,61 @@ func (e *HostEnvironment) terminateRunningProcesses(ctx context.Context) { } } +// hostCleanupTimeout bounds each filesystem-teardown phase of the host +// environment so a single stalled delete cannot wedge the runner slot forever. +// A var (not const) so tests can shrink it. +var hostCleanupTimeout = 30 * time.Second + +// runWithTimeout runs fn in a goroutine and returns once it finishes or timeout +// elapses, whichever comes first. On timeout the goroutine is left running — an +// os.RemoveAll blocked inside a delete syscall (AV/EDR filter drivers, an +// unresponsive network mount, a dying disk) cannot be interrupted — and +// context.DeadlineExceeded is returned. Leaking the goroutine and the scratch +// state it was deleting is strictly better than blocking the caller forever and +// permanently losing the runner's capacity slot; the leaked scratch dir is +// reclaimed later by the runner's idle stale-dir sweep. +func runWithTimeout(fn func(), timeout time.Duration) error { + done := make(chan struct{}) + go func() { + defer close(done) + fn() + }() + timer := time.NewTimer(timeout) + defer timer.Stop() + select { + case <-done: + return nil + case <-timer.C: + return context.DeadlineExceeded + } +} + func (e *HostEnvironment) Remove() common.Executor { return func(ctx context.Context) error { + logger := common.Logger(ctx) + // Ensure any lingering child processes are ended before attempting // to remove the workspace (Windows file locks otherwise prevent cleanup). e.terminateRunningProcesses(ctx) // Only removes per-job misc state. Must not remove the cache/toolcache root. + // Bound it: CleanUp is a caller-supplied, typically unbounded os.RemoveAll, + // and a delete stalled by a filesystem filter driver would otherwise hang + // the job forever at "Cleaning up container" and hold the capacity slot. if e.CleanUp != nil { - e.CleanUp() + logger.Debugf("running host environment cleanup callback") + if err := runWithTimeout(e.CleanUp, hostCleanupTimeout); err != nil { + logger.Warnf("host environment cleanup did not finish within %s; continuing job completion, scratch state may be leaked and is reclaimed by the idle stale-dir sweep", hostCleanupTimeout) + } else { + logger.Debugf("host environment cleanup callback finished") + } } // Detach: a cancelled ctx would skip removePathWithRetry's retries, // which absorb Windows file-handle release lag after the kill above. - rmCtx, rmCancel := context.WithTimeout(context.Background(), 30*time.Second) + rmCtx, rmCancel := context.WithTimeout(context.Background(), hostCleanupTimeout) defer rmCancel() - logger := common.Logger(ctx) var errs []error if err := removePathWithRetry(rmCtx, e.Path); err != nil { logger.Warnf("failed to remove host misc state %s: %v", e.Path, err) @@ -561,7 +620,14 @@ func (e *HostEnvironment) Remove() common.Executor { errs = append(errs, err) } } - return errors.Join(errs...) + for _, err := range errs { + if !errors.Is(err, context.DeadlineExceeded) { + return errors.Join(errs...) + } + } + // Bounded teardown timed out; warnings already logged above. Do not + // fail job completion — leaked scratch is reclaimed by the idle sweep. + return nil } } diff --git a/act/container/host_environment_test.go b/act/container/host_environment_test.go index 983cffd6..2f5d1c5e 100644 --- a/act/container/host_environment_test.go +++ b/act/container/host_environment_test.go @@ -15,6 +15,7 @@ import ( "runtime" "strings" "testing" + "time" "gitea.com/gitea/runner/act/common" @@ -188,6 +189,118 @@ func TestHostEnvironmentRemoveCleansWorkdirWhenOwned(t *testing.T) { assert.ErrorIs(t, err, os.ErrNotExist) } +func TestRemoveAllWithContextDoesNotHangOnStuckDelete(t *testing.T) { + release := make(chan struct{}) + stubDone := make(chan struct{}) + + orig := removeAll + removeAll = func(string) error { + defer close(stubDone) + <-release + return nil + } + // removeAllWithContext intentionally leaks the delete goroutine on timeout, + // and that goroutine still references removeAll. Unblock it and wait for it + // to return before restoring the var, so the restore can't race the read. + t.Cleanup(func() { + close(release) + <-stubDone + removeAll = orig + }) + + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + + err := removeAllWithContext(ctx, t.TempDir()) + require.ErrorIs(t, err, context.DeadlineExceeded) +} + +// TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp guards against a stalled +// CleanUp callback (e.g. an os.RemoveAll blocked by an AV/EDR filter driver or +// an unresponsive mount) wedging the runner slot forever at "Cleaning up +// container". Remove must time out the callback and complete job teardown. +func TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp(t *testing.T) { + // Keep the suite fast: shrink the per-phase teardown timeout for this test. + orig := hostCleanupTimeout + hostCleanupTimeout = 100 * time.Millisecond + t.Cleanup(func() { hostCleanupTimeout = orig }) + + logger := logrus.New() + ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger)) + base := t.TempDir() + path := filepath.Join(base, "misc", "hostexecutor") + require.NoError(t, os.MkdirAll(path, 0o700)) + + release := make(chan struct{}) + t.Cleanup(func() { close(release) }) // unblock the leaked goroutine at test end + + e := &HostEnvironment{ + Path: path, + CleanUp: func() { + <-release // simulate a delete syscall stuck indefinitely + }, + StdOut: os.Stdout, + } + + done := make(chan error, 1) + go func() { done <- e.Remove()(ctx) }() + + select { + case err := <-done: + require.NoError(t, err) + case <-time.After(10 * time.Second): + t.Fatal("Remove() hung on a stuck CleanUp callback") + } +} + +// TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval guards against a +// stalled os.RemoveAll on the misc/workspace paths (same AV/EDR wedge as +// #1023) wedging job completion after the CleanUp callback has already timed +// out or finished. +func TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval(t *testing.T) { + origTimeout := hostCleanupTimeout + hostCleanupTimeout = 100 * time.Millisecond + t.Cleanup(func() { hostCleanupTimeout = origTimeout }) + + release := make(chan struct{}) + stubDone := make(chan struct{}) + + origRemoveAll := removeAll + removeAll = func(string) error { + defer close(stubDone) + <-release + return nil + } + // The stuck delete goroutine outlives the timed-out Remove and still reads + // removeAll; unblock it and wait before restoring to avoid a restore/read race. + t.Cleanup(func() { + close(release) + <-stubDone + removeAll = origRemoveAll + }) + + logger := logrus.New() + ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger)) + base := t.TempDir() + path := filepath.Join(base, "misc", "hostexecutor") + require.NoError(t, os.MkdirAll(path, 0o700)) + + e := &HostEnvironment{ + Path: path, + StdOut: os.Stdout, + } + + done := make(chan error, 1) + go func() { done <- e.Remove()(ctx) }() + + select { + case err := <-done: + require.NoError(t, err) + case <-time.After(10 * time.Second): + t.Fatal("Remove() hung on a stuck path removal") + } +} + func TestBuildWindowsWorkspaceKillScript(t *testing.T) { t.Run("single dir", func(t *testing.T) { s := buildWindowsWorkspaceKillScript([]string{`C:\workspace\job1`}) diff --git a/internal/app/run/runner.go b/internal/app/run/runner.go index 69e8c2cc..907ebf8d 100644 --- a/internal/app/run/runner.go +++ b/internal/app/run/runner.go @@ -127,15 +127,22 @@ func (r *Runner) OnIdle(ctx context.Context) { if !r.shouldRunIdleCleanup() { return } - workdirParent := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/") - workdirRoot := filepath.FromSlash("/" + workdirParent) - r.cleanupStaleTaskDirs(ctx, workdirRoot) + // Bind-workdir mode: reclaim stale per-task workspace dirs (numeric task IDs). + if r.cfg.Container.BindWorkdir { + workdirParent := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/") + workdirRoot := filepath.FromSlash("/" + workdirParent) + r.cleanupStaleDirs(ctx, workdirRoot, isTaskIDDir) + } + // Host mode: reclaim per-job scratch dirs left behind when HostEnvironment + // cleanup timed out (e.g. a delete stalled by an AV/EDR filter driver). They + // sit under the host workdir parent alongside the shared tool_cache, which + // the name match leaves untouched. No-op when no host-mode job ever ran. + if hostRoot := filepath.FromSlash(r.cfg.Host.WorkdirParent); hostRoot != "" { + r.cleanupStaleDirs(ctx, hostRoot, isHostScratchDir) + } } func (r *Runner) shouldRunIdleCleanup() bool { - if !r.cfg.Container.BindWorkdir { - return false - } if r.cfg.Runner.WorkdirCleanupAge <= 0 || r.cfg.Runner.IdleCleanupInterval <= 0 { return false } @@ -155,18 +162,52 @@ func (r *Runner) shouldRunIdleCleanup() bool { } } +// cleanupStaleTaskDirs reclaims stale bind-workdir per-task directories under +// workdirRoot. Retained as a thin wrapper so existing callers and tests keep a +// stable entry point. func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) { - entries, err := os.ReadDir(workdirRoot) + r.cleanupStaleDirs(ctx, workdirRoot, isTaskIDDir) +} + +// isTaskIDDir reports whether name is a per-task workspace dir (numeric task +// ID). Any other directory is skipped to avoid deleting operator-managed data +// under workdir_root. +func isTaskIDDir(name string) bool { + _, err := strconv.ParseUint(name, 10, 64) + return err == nil +} + +// isHostScratchDir reports whether name is a per-job host-mode scratch dir: +// hex.EncodeToString of 8 random bytes, i.e. exactly 16 lowercase hex chars +// (see startHostEnvironment in act/runner/run_context.go). The narrow match +// leaves the sibling shared "tool_cache" dir and any operator data untouched. +func isHostScratchDir(name string) bool { + if len(name) != 16 { + return false + } + for _, c := range name { + if (c < '0' || c > '9') && (c < 'a' || c > 'f') { + return false + } + } + return true +} + +// cleanupStaleDirs removes immediate child directories of root that match and +// whose mtime is older than WorkdirCleanupAge. It is a no-op when root does not +// exist yet (the runner has never written there). +func (r *Runner) cleanupStaleDirs(ctx context.Context, root string, match func(name string) bool) { + entries, err := os.ReadDir(root) if err != nil { if errors.Is(err, os.ErrNotExist) { return } - log.Warnf("failed to list task workspace root %s for stale cleanup: %v", workdirRoot, err) + log.Warnf("failed to list directory %s for stale cleanup: %v", root, err) return } // A task may begin between shouldRunIdleCleanup's running-count check and - // the loop below. That is safe because new task dirs are created with the + // the loop below. That is safe because new dirs are created with the // current mtime and therefore fall on the keep side of cutoff. cutoff := r.now().Add(-r.cfg.Runner.WorkdirCleanupAge) for _, entry := range entries { @@ -176,25 +217,23 @@ func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) { if !entry.IsDir() { continue } - // Task workspaces are indexed by numeric task IDs; skip any other - // directories to avoid deleting operator-managed data under workdir_root. - if _, err := strconv.ParseUint(entry.Name(), 10, 64); err != nil { + if !match(entry.Name()) { continue } info, err := entry.Info() if err != nil { - log.Warnf("failed to stat task workspace %s: %v", filepath.Join(workdirRoot, entry.Name()), err) + log.Warnf("failed to stat %s: %v", filepath.Join(root, entry.Name()), err) continue } if info.ModTime().After(cutoff) { continue } - taskDir := filepath.Join(workdirRoot, entry.Name()) - if err := os.RemoveAll(taskDir); err != nil { - log.Warnf("failed to clean stale task workspace %s: %v", taskDir, err) + dir := filepath.Join(root, entry.Name()) + if err := os.RemoveAll(dir); err != nil { + log.Warnf("failed to clean stale directory %s: %v", dir, err) continue } - log.Infof("cleaned stale task workspace %s", taskDir) + log.Infof("cleaned stale directory %s", dir) } } diff --git a/internal/app/run/runner_idle_cleanup_test.go b/internal/app/run/runner_idle_cleanup_test.go index 8f1dce47..5235ede8 100644 --- a/internal/app/run/runner_idle_cleanup_test.go +++ b/internal/app/run/runner_idle_cleanup_test.go @@ -52,6 +52,55 @@ func TestRunnerCleanupStaleTaskDirs(t *testing.T) { assert.DirExists(t, alphaNumericTask) } +// TestRunnerOnIdleCleansStaleHostScratchDirs covers the host-mode leak path: +// a per-job scratch dir (16 hex chars) left behind by a timed-out cleanup must +// be reclaimed, while the shared tool_cache and operator data are preserved. +func TestRunnerOnIdleCleansStaleHostScratchDirs(t *testing.T) { + now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC) + hostRoot := filepath.Join(t.TempDir(), "act") + require.NoError(t, os.MkdirAll(hostRoot, 0o700)) + + staleScratch := filepath.Join(hostRoot, "0123456789abcdef") // 16 hex + freshScratch := filepath.Join(hostRoot, "fedcba9876543210") + toolCache := filepath.Join(hostRoot, "tool_cache") + operatorData := filepath.Join(hostRoot, "keep-me") + for _, path := range []string{staleScratch, freshScratch, toolCache, operatorData} { + require.NoError(t, os.MkdirAll(path, 0o700)) + } + require.NoError(t, os.Chtimes(staleScratch, now.Add(-48*time.Hour), now.Add(-48*time.Hour))) + require.NoError(t, os.Chtimes(freshScratch, now.Add(-10*time.Minute), now.Add(-10*time.Minute))) + require.NoError(t, os.Chtimes(toolCache, now.Add(-72*time.Hour), now.Add(-72*time.Hour))) + require.NoError(t, os.Chtimes(operatorData, now.Add(-72*time.Hour), now.Add(-72*time.Hour))) + + r := &Runner{ + cfg: &config.Config{ + Host: config.Host{WorkdirParent: hostRoot}, + Runner: config.Runner{ + WorkdirCleanupAge: 24 * time.Hour, + IdleCleanupInterval: time.Minute, + }, + }, + now: func() time.Time { return now }, + } + + r.OnIdle(context.Background()) + + assert.NoDirExists(t, staleScratch) // stale scratch reclaimed + assert.DirExists(t, freshScratch) // within cleanup age, kept + assert.DirExists(t, toolCache) // shared cache, never a scratch match + assert.DirExists(t, operatorData) // non-hex name, untouched +} + +func TestIsHostScratchDir(t *testing.T) { + assert.True(t, isHostScratchDir("0123456789abcdef")) + assert.True(t, isHostScratchDir("ffffffffffffffff")) + assert.False(t, isHostScratchDir("tool_cache")) + assert.False(t, isHostScratchDir("0123456789ABCDEF")) // hex.EncodeToString is lowercase + assert.False(t, isHostScratchDir("0123456789abcde")) // 15 chars + assert.False(t, isHostScratchDir("0123456789abcdef0")) // 17 chars + assert.False(t, isHostScratchDir("123")) +} + func TestRunnerCleanupStaleTaskDirsMissingRoot(t *testing.T) { r := &Runner{ cfg: &config.Config{ @@ -135,7 +184,10 @@ func TestRunnerShouldRunIdleCleanupSkipsWhenJobRunning(t *testing.T) { assert.False(t, r.shouldRunIdleCleanup()) } -func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) { +// Idle cleanup runs regardless of bind_workdir: host mode (bind_workdir off) +// still leaves per-job scratch dirs that the sweep must reclaim. +func TestRunnerShouldRunIdleCleanupRunsWithoutBindWorkdir(t *testing.T) { + now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC) r := &Runner{ cfg: &config.Config{ Runner: config.Runner{ @@ -143,10 +195,10 @@ func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) { IdleCleanupInterval: time.Minute, }, }, - now: time.Now, + now: func() time.Time { return now }, } - assert.False(t, r.shouldRunIdleCleanup()) + assert.True(t, r.shouldRunIdleCleanup()) } func TestRunnerShouldRunIdleCleanupSkipsWhenDisabled(t *testing.T) { diff --git a/internal/pkg/config/config.example.yaml b/internal/pkg/config/config.example.yaml index 7fb081f9..0459dd35 100644 --- a/internal/pkg/config/config.example.yaml +++ b/internal/pkg/config/config.example.yaml @@ -40,11 +40,12 @@ runner: # The runner uses exponential backoff when idle, increasing the interval up to this maximum. # Set to 0 or same as fetch_interval to disable backoff. fetch_interval_max: 5s - # While idle, remove stale bind-workdir task directories older than this duration. - # Setting either workdir_cleanup_age or idle_cleanup_interval to 0 (or any - # non-positive value) disables workdir cleanup entirely. + # While idle, remove stale bind-workdir task directories and orphaned host-mode + # scratch directories (left behind when a host cleanup delete stalls) older than + # this duration. Setting either workdir_cleanup_age or idle_cleanup_interval to 0 + # (or any non-positive value) disables stale-directory cleanup entirely. workdir_cleanup_age: 24h - # Cadence for the idle stale bind-workdir cleanup pass. + # Cadence for the idle stale-directory cleanup pass. idle_cleanup_interval: 10m # The base interval for periodic log flush to the Gitea instance. # Logs may be sent earlier if the buffer reaches log_report_batch_size diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index f620c0e9..23dd5e49 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -33,8 +33,8 @@ type Runner struct { FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources. FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources. FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle. - WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories older than this duration during idle cleanup. - IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale bind-workdir cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence. + WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories and orphaned host-mode scratch dirs older than this duration during idle cleanup. + IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale-directory cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence. LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush. LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent. LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size.