Files
next.js/run-evals.js
Arian Tron 61f56f997c
Some checks failed
Test examples / Test Examples (20) (push) Has been cancelled
Test examples / Test Examples (22) (push) Has been cancelled
Lock Threads / action (push) Has been cancelled
Trigger Release / start (push) Has been cancelled
Stale issue handler / stale (push) Has been cancelled
Update Font Data / create-pull-request (push) Has been cancelled
build-and-deploy / deploy-target (push) Has been cancelled
build-and-deploy / build (push) Has been cancelled
build-and-deploy / stable - aarch64-unknown-linux-musl - node@16 (push) Has been cancelled
build-and-deploy / stable - x86_64-unknown-linux-musl - node@16 (push) Has been cancelled
build-and-deploy / stable - aarch64-unknown-linux-gnu - node@16 (push) Has been cancelled
build-and-deploy / stable - x86_64-unknown-linux-gnu - node@16 (push) Has been cancelled
build-and-deploy / stable - aarch64-pc-windows-msvc - node@16 (push) Has been cancelled
build-and-deploy / stable - x86_64-pc-windows-msvc - node@16 (push) Has been cancelled
build-and-deploy / stable - aarch64-apple-darwin - node@16 (push) Has been cancelled
build-and-deploy / stable - x86_64-apple-darwin - node@16 (push) Has been cancelled
build-and-deploy / build-wasm (nodejs) (push) Has been cancelled
build-and-deploy / build-wasm (web) (push) Has been cancelled
build-and-deploy / Deploy preview tarball (push) Has been cancelled
build-and-deploy / Potentially publish release (push) Has been cancelled
build-and-deploy / publish-turbopack-npm-packages (push) Has been cancelled
build-and-deploy / Deploy examples (push) Has been cancelled
build-and-deploy / thank you, build (push) Has been cancelled
build-and-deploy / Upload Turbopack Bytesize metrics to Datadog (push) Has been cancelled
Rspack Next.js development integration tests / Rspack integration tests (push) Has been cancelled
Rspack Next.js production integration tests / Rspack integration tests (push) Has been cancelled
Turbopack Next.js development integration tests / Next.js integration tests (push) Has been cancelled
Turbopack Next.js production integration tests / Next.js integration tests (push) Has been cancelled
Update Rspack test manifest / Update and upload Rspack development test manifest (push) Has been cancelled
Update Rspack test manifest / Update and upload Rspack production test manifest (push) Has been cancelled
Upload bundler test manifests to areweturboyet.com / Upload test results (push) Has been cancelled
Update React / create-pull-request (push) Has been cancelled
test-e2e-project-reset-cron / reset-test-project (push) Has been cancelled
Notify about the top 15 issues/PRs/feature requests (most reacted) in the last 90 days / run (push) Has been cancelled
first commit
2026-03-10 19:37:31 +03:30

194 lines
6.2 KiB
JavaScript

#!/usr/bin/env node
// @ts-check
/**
* Pack the locally-built `next` package and run agent evals against it.
*
*   pnpm eval <eval-name>            run one eval, both variants (baseline + AGENTS.md)
*   pnpm eval <eval-name> --dry      preview without executing
*   pnpm eval --all                  run every eval (slow — normally only CI does this)
*   NEXT_SKIP_PACK=1 pnpm eval ...   reuse tarball from last run
*
* Mirrors run-tests.js: pack once, hand paths to child via env, forward args.
*
* We only pack `next`, not the whole workspace. The sandbox is remote Linux:
* - @next/swc: local darwin binary wouldn't run there; the sandbox downloads
* the right one at runtime (packages/next/src/build/swc/index.ts).
* - @next/env etc: resolved from npm at the pinned canary version.
*
* The experiments/ dir is generated fresh on every run and gitignored. This
* keeps the two variants (baseline vs. AGENTS.md) in one place instead of
* maintaining N committed experiment files that only differ by one line.
*/
const path = require('path')
const fs = require('fs')
const { execFileSync, spawnSync } = require('child_process')
// Directory containing this script; every other path is derived from it.
const ROOT = __dirname

// Sandbox + agent API keys. agent-eval looks for .env.local in its own cwd
// (evals/), but `vc env pull` writes to the repo root. Load it here so the
// vars are already in process.env when we spawn the child.
try {
  process.loadEnvFile(path.join(ROOT, '.env.local'))
} catch (err) {
  // Loading stays best-effort and never aborts the run. A missing file is
  // fine (the variables may already be set in the environment), so ENOENT is
  // silent. Anything else (unreadable file, or a Node version that has no
  // process.loadEnvFile) would otherwise surface much later as a confusing
  // missing-key failure, so say what happened.
  const cause = /** @type {NodeJS.ErrnoException} */ (err)
  if (cause.code !== 'ENOENT') {
    console.warn(`> Could not load .env.local: ${cause.message}`)
  }
}

// Working directory handed to agent-eval.
const EVALS_DIR = path.join(ROOT, 'evals')
// One sub-directory per eval fixture (see listEvals).
const FIXTURES_DIR = path.join(EVALS_DIR, 'evals')
// Wiped and regenerated on every run by writeExperiments.
const EXPERIMENTS_DIR = path.join(EVALS_DIR, 'experiments')
// Where pack() leaves the packed `next` tarball, under a fixed file name.
const TARBALL_DIR = path.join(EVALS_DIR, '.tarballs')
const TARBALL = path.join(TARBALL_DIR, 'next.tgz')
/**
 * The pair of experiment variants compared on every run.
 *
 * Each entry supplies the file-name stem (`suffix`), the import line and the
 * body of the `setup` callback that writeExperiments splices into the
 * generated experiment file.
 *
 * Order is deliberate and only affects how readable the output is: baseline
 * comes first so a contributor sees "does the agent fail without docs?"
 * before "does it pass with docs?".
 */
const VARIANTS = [
  {
    suffix: 'baseline',
    imports: "import { installNextJs } from '../lib/setup.js'",
    setup: 'await installNextJs(sandbox)',
  },
  {
    suffix: 'agents-md',
    imports: "import { installNextJs, writeAgentsMd } from '../lib/setup.js'",
    // Two statements, joined the same way they appear in the generated file.
    setup: ['await installNextJs(sandbox)', 'await writeAgentsMd(sandbox)'].join(
      '\n '
    ),
  },
]
/**
 * Pack `packages/next` with `pnpm pack` and move the result to TARBALL.
 *
 * Creates TARBALL_DIR if needed. Throws whatever execFileSync / renameSync
 * throw (pnpm missing, non-zero exit, rename failure).
 */
function pack() {
  fs.mkdirSync(TARBALL_DIR, { recursive: true })

  const packageDir = path.join(ROOT, 'packages/next')
  const stdout = execFileSync('pnpm', ['pack', '--pack-destination', TARBALL_DIR], {
    cwd: packageDir,
    encoding: 'utf8',
  })

  // The last line of pnpm's output is treated as the tarball it produced;
  // it may be absolute or relative to the pack destination.
  const lines = stdout.trim().split('\n')
  const reported = lines[lines.length - 1]
  let packedPath = reported
  if (!path.isAbsolute(reported)) {
    packedPath = path.join(TARBALL_DIR, reported)
  }

  // Normalise to a fixed name so later steps don't need the version.
  fs.renameSync(packedPath, TARBALL)
}
/**
 * Regenerate EXPERIMENTS_DIR from scratch: delete it, recreate it, and write
 * one `<suffix>.ts` experiment config per entry in VARIANTS.
 *
 * @param {string | null} evalName eval to restrict the experiments to; null
 *   means all evals (no `evals` field is emitted)
 */
function writeExperiments(evalName) {
  fs.rmSync(EXPERIMENTS_DIR, { recursive: true, force: true })
  fs.mkdirSync(EXPERIMENTS_DIR, { recursive: true })
  // The name ends up inside generated TypeScript source, so emit it as a
  // properly escaped string literal. Raw interpolation between single quotes
  // breaks (or injects code) as soon as the name contains `'` or `\`.
  const evalsField = evalName
    ? `\n evals: ${JSON.stringify(evalName)},`
    : ''
  for (const v of VARIANTS) {
    // The template below is the literal file content; its lines are kept
    // flush-left on purpose because leading whitespace would be written out.
    const body = `import type { ExperimentConfig } from '@vercel/agent-eval'
${v.imports}
const config: ExperimentConfig = {
agent: 'claude-code',
model: 'claude-opus-4-6',${evalsField}
scripts: ['build'],
runs: 1,
earlyExit: true,
timeout: 720,
sandbox: 'auto',
setup: async (sandbox) => {
${v.setup}
},
}
export default config
`
    fs.writeFileSync(path.join(EXPERIMENTS_DIR, `${v.suffix}.ts`), body)
  }
}
/**
 * List the available evals: the names of the directories found directly
 * inside FIXTURES_DIR, in the order readdirSync reports them.
 *
 * @returns {string[]}
 */
function listEvals() {
  const names = []
  for (const entry of fs.readdirSync(FIXTURES_DIR, { withFileTypes: true })) {
    // Plain files are not evals.
    if (entry.isDirectory()) {
      names.push(entry.name)
    }
  }
  return names
}
/**
 * True when `name` names a fixture: a single path segment that is a
 * directory directly inside FIXTURES_DIR.
 *
 * Checking fs.existsSync(path.join(FIXTURES_DIR, name)) alone is too lax:
 * path.join normalises the result, so `.`, `..` and `bogus/../real-eval`
 * all "exist", and so do plain files. Those values would then be written
 * into the generated experiment config as if they were eval names.
 *
 * @param {string} name
 * @returns {boolean}
 */
function isFixtureName(name) {
  if (name === '.' || name === '..' || path.basename(name) !== name) {
    return false
  }
  const stats = fs.statSync(path.join(FIXTURES_DIR, name), {
    throwIfNoEntry: false,
  })
  return stats !== undefined && stats.isDirectory()
}

/**
 * CLI entry point: parse arguments, make sure a `next` tarball is available,
 * generate the experiment files, then run agent-eval and exit with its
 * status.
 *
 * Exits 1 when packages/next/dist is missing, when agent-eval cannot be
 * spawned, or when agent-eval ends without an exit status.
 */
function main() {
  const argv = require('yargs/yargs')(process.argv.slice(2))
    .command(
      '$0 [eval-name]',
      'Run an eval (baseline + agents-md variants)',
      (y) =>
        y.positional('eval-name', {
          type: 'string',
          describe: 'Fixture directory name',
        })
    )
    .boolean('all')
    .describe('all', 'Run every eval (slow — normally only CI does this)')
    .boolean('dry')
    .describe('dry', 'Preview without executing')
    .conflicts('all', 'eval-name')
    .check((argv) => {
      // Exactly one of <eval-name> / --all is required; list the choices so
      // the user does not have to go looking for them.
      if (!argv.all && !argv.evalName) {
        throw new Error(
          `Missing <eval-name>.\n\nAvailable evals:\n${listEvals()
            .map((n) => ` ${n}`)
            .join('\n')}`
        )
      }
      if (argv.evalName && !isFixtureName(argv.evalName)) {
        throw new Error(
          `Unknown eval: ${argv.evalName}\n(looked in ${FIXTURES_DIR})`
        )
      }
      return true
    })
    .strict()
    .help().argv

  /** @type {string | null} */
  const evalName = argv.all ? null : /** @type {string} */ (argv.evalName)

  // --dry is the only flag passed through to agent-eval; .strict() above
  // rejects any flag that is not declared here.
  const forward = argv.dry ? ['--dry'] : []

  // pack() ships the build output, so there is nothing to pack without it.
  if (!fs.existsSync(path.join(ROOT, 'packages/next/dist'))) {
    console.error(
      'packages/next/dist not found. Run `pnpm --filter=next build` first.'
    )
    process.exit(1)
  }

  // NEXT_SKIP_PACK only takes effect when a tarball from an earlier run is
  // still there; otherwise we pack regardless.
  if (process.env.NEXT_SKIP_PACK && fs.existsSync(TARBALL)) {
    console.log('> Reusing existing tarball (NEXT_SKIP_PACK=1)')
  } else {
    console.log('> Packing next...')
    pack()
    const mb = (fs.statSync(TARBALL).size / 1024 / 1024).toFixed(1)
    console.log(` ${TARBALL} (${mb} MB)`)
  }

  writeExperiments(evalName)

  console.log(
    evalName
      ? `> Running ${evalName} (baseline + agents-md)`
      : '> Running all evals (baseline + agents-md)'
  )

  // Same handoff pattern as run-tests.js with NEXT_TEST_PKG_PATHS. We invoke
  // the bin directly rather than via `pnpm exec` because pnpm resets cwd to
  // the workspace root, but agent-eval resolves experiments/ from process.cwd().
  const bin = path.join(ROOT, 'node_modules/.bin/agent-eval')
  const result = spawnSync(bin, ['run-all', '--force', ...forward], {
    cwd: EVALS_DIR,
    stdio: 'inherit',
    env: { ...process.env, NEXT_EVAL_TARBALL: TARBALL },
  })

  if (result.error) {
    // ENOENT (missing bin), EACCES, etc. — spawnSync returns status: null
    // without printing anything, so surface it.
    console.error(`Failed to run ${bin}: ${result.error.message}`)
    if (/** @type {NodeJS.ErrnoException} */ (result.error).code === 'ENOENT') {
      console.error('Did you run `pnpm install`?')
    }
    process.exit(1)
  }

  // status is null when the child did not exit normally; report that as failure.
  process.exit(result.status ?? 1)
}

main()