diff --git a/apps/cli/src/cli.ts b/apps/cli/src/cli.ts
index 12dece45c..995cc8c26 100644
--- a/apps/cli/src/cli.ts
+++ b/apps/cli/src/cli.ts
@@ -1,6 +1,32 @@
 #!/usr/bin/env node
+import { killAllTrackedChildren } from '@agentv/core';
+
 import { runCli } from './index.js';
 
+// Forward SIGINT/SIGTERM to spawned provider subprocesses before exiting.
+// Without this, Studio's `child.kill('SIGTERM')` against the CLI orphans
+// any in-flight `claude`/`codex`/`pi`/`copilot` subprocess. The partial
+// `index.jsonl` is already row-by-row durable, so finished tests survive.
+//
+// First signal: kill children, exit with the conventional 128+signal code.
+// Second signal within the same process: hard-exit so a hung child cannot
+// trap the user.
+let interrupted = false;
+function installShutdown(signal: NodeJS.Signals, exitCode: number) {
+  process.on(signal, () => {
+    if (interrupted) {
+      process.exit(1);
+    }
+    interrupted = true;
+    killAllTrackedChildren('SIGTERM');
+    // Defer exit briefly so the SIGTERM has a chance to dispatch before the
+    // event loop tears down.
+    setTimeout(() => process.exit(exitCode), 50);
+  });
+}
+installShutdown('SIGINT', 130);
+installShutdown('SIGTERM', 143);
+
 runCli()
   .then(() => {
     process.exit(0);
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index e005d786f..c0a511915 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -34,7 +34,7 @@ export function deduplicateByTestIdTarget(
 export async function aggregateRunDir(
   runDir: string,
-  options?: { evalFile?: string; experiment?: string },
+  options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
 ): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
   const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
   const content = await readFile(indexPath, 'utf8');
@@ -45,7 +45,18 @@ export async function aggregateRunDir(
   const timingPath = path.join(runDir, 'timing.json');
   await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
 
-  const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+  // Preserve `planned_test_count` from any pre-existing benchmark.json (e.g.
+  // the stub written at run start, or from the original run when this is a
+  // resume) unless an explicit value was passed.
+  const plannedTestCount =
+    options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json')));
+
+  const benchmark = buildBenchmarkArtifact(
+    results,
+    options?.evalFile,
+    options?.experiment,
+    plannedTestCount,
+  );
   const benchmarkPath = path.join(runDir, 'benchmark.json');
   await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
 
@@ -53,6 +64,17 @@ export async function aggregateRunDir(
   return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size };
 }
 
+async function readPlannedTestCount(benchmarkPath: string): Promise<number | undefined> {
+  try {
+    const raw = await readFile(benchmarkPath, 'utf8');
+    const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
+    const value = parsed.metadata?.planned_test_count;
+    return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
 // ---------------------------------------------------------------------------
 // Artifact interfaces (snake_case to match skill-creator conventions)
 // ---------------------------------------------------------------------------
@@ -110,6 +132,13 @@ export interface BenchmarkArtifact {
     readonly targets: readonly string[];
     readonly tests_run: readonly string[];
     readonly experiment?: string;
+    /**
+     * Total number of test cases the run was planned to execute (across all
+     * targets and eval files). Written at run start so an interrupted run
+     * can be diagnosed as resumable when `tests_run.length < planned_test_count`,
+     * even if every recorded row has `execution_status: ok`.
+     */
+    readonly planned_test_count?: number;
   };
   readonly run_summary: Record<
     string,
@@ -364,6 +393,7 @@ export function buildBenchmarkArtifact(
   results: readonly EvaluationResult[],
   evalFile = '',
   experiment?: string,
+  plannedTestCount?: number,
 ): BenchmarkArtifact {
   const targetSet = new Set<string>();
   const testIdSet = new Set<string>();
@@ -457,6 +487,7 @@ export function buildBenchmarkArtifact(
       targets,
       tests_run: testIds,
       experiment,
+      planned_test_count: plannedTestCount,
     },
     run_summary: runSummary,
     per_grader_summary: perEvaluatorSummary,
@@ -464,6 +495,35 @@ export function buildBenchmarkArtifact(
   };
 }
 
+/**
+ * Write a stub `benchmark.json` at the start of a run, before any tests
+ * have executed. Carries `planned_test_count` so an interrupted run can
+ * still be detected as resumable even when every recorded row has
+ * `execution_status: ok`.
+ *
+ * The end-of-run write (writeArtifactsFromResults / aggregateRunDir)
+ * overwrites this file with the full summary; it preserves
+ * `planned_test_count` by reading it back and passing it through.
+ */
+export async function writeInitialBenchmarkArtifact(
+  runDir: string,
+  options: {
+    evalFile: string;
+    plannedTestCount: number;
+    experiment?: string;
+  },
+): Promise<void> {
+  await mkdir(runDir, { recursive: true });
+  const stub = buildBenchmarkArtifact(
+    [],
+    options.evalFile,
+    options.experiment,
+    options.plannedTestCount,
+  );
+  const benchmarkPath = path.join(runDir, 'benchmark.json');
+  await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
+}
+
 export function buildAggregateGradingArtifact(
   results: readonly EvaluationResult[],
 ): AggregateGradingArtifact {
@@ -826,7 +886,7 @@ export async function writePerTestArtifacts(
 export async function writeArtifactsFromResults(
   results: readonly EvaluationResult[],
   outputDir: string,
-  options?: { evalFile?: string; experiment?: string },
+  options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
 ): Promise<{
   testArtifactDir: string;
   timingPath: string;
@@ -877,8 +937,16 @@ export async function writeArtifactsFromResults(
   const timing = buildTimingArtifact(results);
   await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
 
-  // Write benchmark
-  const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+  // Write benchmark — preserve `planned_test_count` from the run-start stub
+  // (or from the original run when this is a resume) unless an explicit
+  // value was passed by the caller.
+  const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath));
+  const benchmark = buildBenchmarkArtifact(
+    results,
+    options?.evalFile,
+    options?.experiment,
+    plannedTestCount,
+  );
   await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
 
   await writeJsonlFile(indexPath, indexRecords);
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index db0b0fb19..0f3536061 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -38,6 +38,7 @@ import {
   deduplicateByTestIdTarget,
   parseJsonlResults,
   writeArtifactsFromResults,
+  writeInitialBenchmarkArtifact,
 } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
@@ -1447,6 +1448,21 @@ export async function runEvalCommand(
     );
   }
 
+  // Write a stub benchmark.json before dispatching tests, carrying the planned
+  // execution count so an interrupted run can still surface as resumable in
+  // Studio (results.length < planned_test_count) even when every recorded row
+  // has execution_status: ok. The end-of-run write preserves this value via
+  // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
+  // Skip on resume — we want to preserve the *original* planned count.
+  if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
+    const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
+    await writeInitialBenchmarkArtifact(runDir, {
+      evalFile,
+      plannedTestCount: totalEvalCount,
+      experiment: normalizeExperimentName(options.experiment),
+    });
+  }
+
   // Eval files run sequentially; within each file, --workers N test cases run in parallel.
   // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
   // workspace races without any grouping complexity.
diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
index e40c01c74..ab601e7f2 100644
--- a/apps/cli/src/commands/results/eval-runner.ts
+++ b/apps/cli/src/commands/results/eval-runner.ts
@@ -412,6 +412,35 @@ export function registerEvalRoutes(
     }
   });
 
+  // ── Stop a running eval ────────────────────────────────────────────────
+  // POST (not DELETE) because Stop is part of the stop → resume → complete
+  // workflow, not a destructive cancel. The run remains resumable from the
+  // partial index.jsonl on disk. Idempotent: hitting /stop on a terminal
+  // run returns 200 with `stopped: false, reason: 'already_terminal'`
+  // rather than 4xx, so clients can fire-and-forget.
+  //
+  // SIGTERM the spawned CLI; the existing child.on('close') flips status
+  // to 'finished'/'failed'. The CLI's own signal handler walks its tracked
+  // grandchildren (claude/codex/pi/copilot subprocesses) and kills them
+  // before exiting.
+  app.post('/api/eval/run/:id/stop', (c) => {
+    if (readOnly) {
+      return c.json({ error: 'Studio is running in read-only mode' }, 403);
+    }
+    const id = c.req.param('id');
+    const run = activeRuns.get(id ?? '');
+    if (!run) return c.json({ error: 'Run not found' }, 404);
+    if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+      return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+    }
+    try {
+      run.process.kill('SIGTERM');
+    } catch (err) {
+      return c.json({ error: (err as Error).message }, 500);
+    }
+    return c.json({ stopped: true, status: run.status });
+  });
+
   // ── Run status ─────────────────────────────────────────────────────────
   app.get('/api/eval/status/:id', (c) => {
     const id = c.req.param('id');
@@ -576,6 +605,24 @@ export function registerEvalRoutes(
     }
   });
 
+  app.post('/api/benchmarks/:benchmarkId/eval/run/:id/stop', (c) => {
+    if (readOnly) {
+      return c.json({ error: 'Studio is running in read-only mode' }, 403);
+    }
+    const id = c.req.param('id');
+    const run = activeRuns.get(id ?? '');
+    if (!run) return c.json({ error: 'Run not found' }, 404);
+    if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+      return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+    }
+    try {
+      run.process.kill('SIGTERM');
+    } catch (err) {
+      return c.json({ error: (err as Error).message }, 500);
+    }
+    return c.json({ stopped: true, status: run.status });
+  });
+
   app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => {
     const id = c.req.param('id');
     const run = activeRuns.get(id ?? '');
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index f88bf212e..8468bdf88 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -347,8 +347,8 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
 function deriveResumeMeta(
   cwd: string,
   manifestPath: string,
-): { run_dir?: string; suite_filter?: string } {
-  const out: { run_dir?: string; suite_filter?: string } = {};
+): { run_dir?: string; suite_filter?: string; planned_test_count?: number } {
+  const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {};
   const runDir = path.dirname(manifestPath);
   const relative = path.relative(cwd, runDir);
   // path.relative returns '..'-prefixed paths when runDir is outside cwd; keep
@@ -359,15 +359,19 @@ function deriveResumeMeta(
     const benchmarkPath = path.join(runDir, 'benchmark.json');
     if (existsSync(benchmarkPath)) {
       const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
-        metadata?: { eval_file?: string };
+        metadata?: { eval_file?: string; planned_test_count?: number };
       };
       const evalFile = parsed.metadata?.eval_file;
       if (typeof evalFile === 'string' && evalFile.trim()) {
         out.suite_filter = evalFile.trim();
       }
+      const planned = parsed.metadata?.planned_test_count;
+      if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) {
+        out.planned_test_count = planned;
+      }
     }
   } catch {
-    // benchmark.json missing / unreadable / malformed — leave suite_filter unset.
+    // benchmark.json missing / unreadable / malformed — leave fields unset.
   }
   return out;
 }
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 0e69e495a..30c04ee66 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -1003,6 +1003,52 @@ describe('serve app', () => {
     });
   });
 
+  // ── POST /api/eval/run/:id/stop — interrupt a running eval ─────────────
+  //
+  // Stop is part of the stop → resume workflow, not a destructive cancel —
+  // POST (not DELETE) and idempotent on already-terminal runs. These tests
+  // validate routing/auth shape (404 unknown id, 403 read-only). The happy
+  // path SIGTERM behavior is covered by manual UAT because it requires a
+  // live subprocess that is reliably mid-run; unit tests that race a launch
+  // against a stop are flaky.
+
+  describe('POST /api/eval/run/:id/stop (stop API)', () => {
+    function makeAppForStop(opts?: { readOnly?: boolean }) {
+      return createApp([], tempDir, undefined, undefined, {
+        studioDir,
+        readOnly: opts?.readOnly === true,
+      });
+    }
+
+    it('returns 404 for an unknown run id', async () => {
+      const app = makeAppForStop();
+      const res = await app.request('/api/eval/run/no-such-id/stop', { method: 'POST' });
+      expect(res.status).toBe(404);
+    });
+
+    it('returns 403 in read-only mode', async () => {
+      const app = makeAppForStop({ readOnly: true });
+      const res = await app.request('/api/eval/run/anything/stop', { method: 'POST' });
+      expect(res.status).toBe(403);
+    });
+
+    it('returns 404 for benchmark-scoped stop with unknown run id', async () => {
+      const app = makeAppForStop();
+      const res = await app.request('/api/benchmarks/some-id/eval/run/no-such-id/stop', {
+        method: 'POST',
+      });
+      expect(res.status).toBe(404);
+    });
+
+    it('returns 403 in read-only mode for benchmark-scoped stop', async () => {
+      const app = makeAppForStop({ readOnly: true });
+      const res = await app.request('/api/benchmarks/some-id/eval/run/anything/stop', {
+        method: 'POST',
+      });
+      expect(res.status).toBe(403);
+    });
+  });
+
   // ── POST /api/eval/preview — argument shaping for resume flags ─────────
   //
   // /api/eval/preview is a lightweight endpoint that returns the CLI
diff --git a/apps/studio/src/components/ResumeRunActions.tsx b/apps/studio/src/components/ResumeRunActions.tsx
index d161c9d22..f50ebe5b5 100644
--- a/apps/studio/src/components/ResumeRunActions.tsx
+++ b/apps/studio/src/components/ResumeRunActions.tsx
@@ -35,6 +35,7 @@ export interface ResumeRunActionsProps {
   target?: string;
   benchmarkId?: string;
   isReadOnly: boolean;
+  plannedTestCount?: number;
 }
 
 export function ResumeRunActions({
@@ -44,12 +45,13 @@ export function ResumeRunActions({
   target,
   benchmarkId,
   isReadOnly,
+  plannedTestCount,
 }: ResumeRunActionsProps) {
   const navigate = useNavigate();
   const [busy, setBusy] = useState<string | null>(null);
   const [error, setError] = useState<string | null>(null);
 
-  if (!shouldShowResumeActions(results, isReadOnly)) return null;
+  if (!shouldShowResumeActions(results, isReadOnly, plannedTestCount)) return null;
 
   // Both actions need the run dir + the original eval file. Without those
   // we can't target the existing run workspace, so we render the buttons
diff --git a/apps/studio/src/components/StopRunButton.tsx b/apps/studio/src/components/StopRunButton.tsx
new file mode 100644
index 000000000..d0eb65e2b
--- /dev/null
+++ b/apps/studio/src/components/StopRunButton.tsx
@@ -0,0 +1,62 @@
+/**
+ * StopRunButton — pause-style affordance on /jobs/:runId that interrupts
+ * a Studio-launched eval. Stop is part of the stop → resume → complete
+ * workflow, not a destructive cancel: the partial index.jsonl is
+ * preserved and can be resumed in one click from the run-detail page.
+ *
+ * Calls POST /api/eval/run/:id/stop (or the benchmark-scoped variant).
+ * Optimistically flips the local label to "Stopping…" until the next
+ * poll of /api/eval/status/:id observes a terminal state — at which
+ * point the button hides via `shouldShowStopButton`.
+ *
+ * Styling is intentionally neutral (gray, not red) to signal that this
+ * is a pause, not a kill.
+ */
+
+import { useState } from 'react';
+
+import { stopEvalRun } from '~/lib/api';
+
+import { type RunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+export interface StopRunButtonProps {
+  runId: string;
+  status: RunStatus | undefined;
+  isReadOnly: boolean;
+  benchmarkId?: string;
+}
+
+export function StopRunButton({ runId, status, isReadOnly, benchmarkId }: StopRunButtonProps) {
+  const [stopping, setStopping] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+
+  if (!shouldShowStopButton(status, isReadOnly)) return null;
+
+  async function onClick() {
+    setStopping(true);
+    setError(null);
+    try {
+      await stopEvalRun(runId, benchmarkId);
+    } catch (err) {
+      setError(err instanceof Error ? err.message : 'Failed to stop run');
+      setStopping(false);
+    }
+    // On success, leave `stopping = true`. The status poller will flip to
+    // a terminal state shortly, at which point the button unmounts.
+  }
+
+  return (
+    <div>
+      <button type="button" onClick={onClick} disabled={stopping}>
+        {stopping ? 'Stopping…' : 'Stop'}
+      </button>
+      {error && (
+        <p>{error}</p>
+      )}
+    </div>
+  );
+}
diff --git a/apps/studio/src/components/resume-run-helpers.test.ts b/apps/studio/src/components/resume-run-helpers.test.ts
index 39e2d807c..9ceb71870 100644
--- a/apps/studio/src/components/resume-run-helpers.test.ts
+++ b/apps/studio/src/components/resume-run-helpers.test.ts
@@ -32,6 +32,23 @@ describe('shouldShowResumeActions', () => {
   it('hides on empty results', () => {
     expect(shouldShowResumeActions([], false)).toBe(false);
   });
+
+  it('shows for an incomplete partial run with only ok rows when planned_test_count exceeds results', () => {
+    // Stop button / Ctrl+C scenario: 5 of 10 planned tests finished
+    // successfully before the run was killed. No execution errors, but
+    // still resumable.
+    const results = [ok('a'), ok('b'), ok('c'), ok('d'), ok('e')];
+    expect(shouldShowResumeActions(results, false, 10)).toBe(true);
+  });
+
+  it('hides when results match planned_test_count (complete passing run)', () => {
+    const results = [ok('a'), ok('b'), ok('c')];
+    expect(shouldShowResumeActions(results, false, 3)).toBe(false);
+  });
+
+  it('hides incomplete partial run in read-only mode', () => {
+    expect(shouldShowResumeActions([ok('a')], true, 5)).toBe(false);
+  });
 });
 
 describe('buildResumeRequestBody', () => {
diff --git a/apps/studio/src/components/resume-run-helpers.ts b/apps/studio/src/components/resume-run-helpers.ts
index fcbf805a0..b8e6451f5 100644
--- a/apps/studio/src/components/resume-run-helpers.ts
+++ b/apps/studio/src/components/resume-run-helpers.ts
@@ -21,14 +21,29 @@ export interface BuildResumeRequestParams {
 }
 
 /**
- * Whether the resume actions should be visible. The button only makes sense
- * when at least one row failed with an execution error and the user has
- * write access (read-only mode hides the entire control rather than
- * showing a disabled button — see issue acceptance criteria).
+ * Whether the resume actions should be visible. The button is shown when:
+ * 1. At least one recorded row has `execution_status: execution_error`, OR
+ * 2. The run is *incomplete* — fewer recorded rows than the originally
+ *    planned execution count, even if every recorded row is `ok`.
+ *
+ * Case 2 covers Stop-button / Ctrl+C interruptions where the run produced
+ * only successful rows before being killed: there is no `execution_error`
+ * to anchor on, but the run is still resumable. `plannedTestCount` is
+ * persisted in `benchmark.json.metadata` at run start (see
+ * `writeInitialBenchmarkArtifact`).
+ *
+ * Hidden in read-only mode — the server also returns 403, but UI-level
+ * hiding avoids dead controls.
  */
-export function shouldShowResumeActions(results: EvalResult[], isReadOnly: boolean): boolean {
+export function shouldShowResumeActions(
+  results: EvalResult[],
+  isReadOnly: boolean,
+  plannedTestCount?: number,
+): boolean {
   if (isReadOnly) return false;
-  return results.some((r) => r.executionStatus === 'execution_error');
+  if (results.some((r) => r.executionStatus === 'execution_error')) return true;
+  if (plannedTestCount !== undefined && results.length < plannedTestCount) return true;
+  return false;
 }
 
 /**
diff --git a/apps/studio/src/components/stop-run-helpers.test.ts b/apps/studio/src/components/stop-run-helpers.test.ts
new file mode 100644
index 000000000..3bd407b1b
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from 'bun:test';
+
+import { isTerminalRunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+describe('isTerminalRunStatus', () => {
+  it('treats finished and failed as terminal', () => {
+    expect(isTerminalRunStatus('finished')).toBe(true);
+    expect(isTerminalRunStatus('failed')).toBe(true);
+  });
+
+  it('treats live states and unknowns as non-terminal', () => {
+    expect(isTerminalRunStatus('starting')).toBe(false);
+    expect(isTerminalRunStatus('running')).toBe(false);
+    expect(isTerminalRunStatus(undefined)).toBe(false);
+  });
+});
+
+describe('shouldShowStopButton', () => {
+  it('shows while the run is live', () => {
+    expect(shouldShowStopButton('starting', false)).toBe(true);
+    expect(shouldShowStopButton('running', false)).toBe(true);
+  });
+
+  it('hides once the run reaches a terminal state', () => {
+    expect(shouldShowStopButton('finished', false)).toBe(false);
+    expect(shouldShowStopButton('failed', false)).toBe(false);
+  });
+
+  it('hides in read-only mode regardless of status', () => {
+    expect(shouldShowStopButton('running', true)).toBe(false);
+  });
+
+  it('hides when the status is undefined', () => {
+    expect(shouldShowStopButton(undefined, false)).toBe(false);
+  });
+});
diff --git a/apps/studio/src/components/stop-run-helpers.ts b/apps/studio/src/components/stop-run-helpers.ts
new file mode 100644
index 000000000..fcef0e56a
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.ts
@@ -0,0 +1,28 @@
+/**
+ * Pure helpers backing StopRunButton, isolated for unit testing.
+ *
+ * Intentionally side-effect-free so the visibility matrix is testable
+ * without rendering React.
+ *
+ * To extend: widen the union of statuses recognized as non-terminal as
+ * the server adds new lifecycle states. Today the server only emits
+ * starting / running / finished / failed; anything not in the terminal
+ * set is treated as live.
+ */
+
+export type RunStatus = 'starting' | 'running' | 'finished' | 'failed' | (string & {});
+
+export function isTerminalRunStatus(status: RunStatus | undefined): boolean {
+  return status === 'finished' || status === 'failed';
+}
+
+/**
+ * Whether the Stop button should be visible. Hidden when the run is
+ * terminal (no process to kill) and in read-only mode (the API also
+ * 403s, but UI-level hiding avoids dead controls).
+ */ +export function shouldShowStopButton(status: RunStatus | undefined, isReadOnly: boolean): boolean { + if (isReadOnly) return false; + if (!status) return false; + return !isTerminalRunStatus(status); +} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 1ea719e6e..eb4b95010 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -539,6 +539,21 @@ export async function launchEvalRun( return res.json() as Promise; } +export async function stopEvalRun( + runId: string, + benchmarkId?: string, +): Promise<{ stopped: boolean; reason?: string; status?: string }> { + const url = benchmarkId + ? `${benchmarkApiBase(benchmarkId)}/eval/run/${runId}/stop` + : `/api/eval/run/${runId}/stop`; + const res = await fetch(url, { method: 'POST' }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed: ${res.status}`); + } + return res.json() as Promise<{ stopped: boolean; reason?: string; status?: string }>; +} + export function evalRunStatusOptions(runId: string | null) { return queryOptions({ queryKey: ['eval-status', runId], diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 46f321d32..6ef4dc1b4 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -80,6 +80,8 @@ export interface RunDetailResponse { run_dir?: string; /** Eval file path the run was launched against, if recorded in benchmark.json. Local runs only. */ suite_filter?: string; + /** Total (test_id, target) executions originally planned for this run. Used to detect incomplete partial runs as resumable. Local runs only, populated when the run was launched after the planned-count metadata feature shipped. */ + planned_test_count?: number; } export interface SuiteSummary { diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx index 169d6d643..2bf2b9ea6 100644 --- a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx @@ -64,7 +64,9 @@ function BenchmarkEvalDetailPage() { Run: {runId} / Eval: {evalId}

- + {passed ? '✓' : '✗'} {evalId} diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx index db0f28bd4..dfb41c244 100644 --- a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx +++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx @@ -77,6 +77,7 @@ function BenchmarkRunDetailPage() { target={target ?? undefined} benchmarkId={benchmarkId} isReadOnly={isReadOnly} + plannedTestCount={data?.planned_test_count} /> {!isReadOnly && (