diff --git a/apps/cli/src/cli.ts b/apps/cli/src/cli.ts
index 12dece45c..995cc8c26 100644
--- a/apps/cli/src/cli.ts
+++ b/apps/cli/src/cli.ts
@@ -1,6 +1,32 @@
#!/usr/bin/env node
+import { killAllTrackedChildren } from '@agentv/core';
+
import { runCli } from './index.js';
+// Forward SIGINT/SIGTERM to spawned provider subprocesses before exiting.
+// Without this, Studio's `child.kill('SIGTERM')` against the CLI orphans
+// any in-flight `claude`/`codex`/`pi`/`copilot` subprocess. The partial
+// `index.jsonl` is already row-by-row durable, so finished tests survive.
+//
+// First signal: kill children, exit with the conventional 128+signal code.
+// Second signal within the same process: hard-exit so a hung child cannot
+// trap the user.
+let interrupted = false;
+function installShutdown(signal: NodeJS.Signals, exitCode: number) {
+ process.on(signal, () => {
+ if (interrupted) {
+ process.exit(1);
+ }
+ interrupted = true;
+ killAllTrackedChildren('SIGTERM');
+ // Defer exit briefly (50 ms) so the SIGTERM sent to the children has a
+ // chance to dispatch before the event loop tears down.
+ setTimeout(() => process.exit(exitCode), 50);
+ });
+}
+installShutdown('SIGINT', 130);
+installShutdown('SIGTERM', 143);
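+
+// Illustrative session (binary name and counts are hypothetical): a Ctrl+C
+// after 5 of 10 tests exits 130 and leaves a resumable index.jsonl.
+//   $ agentv eval evals/suite.yaml   # ^C mid-run
+//   $ echo $?                        # 130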
+
runCli()
.then(() => {
process.exit(0);
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index e005d786f..c0a511915 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -34,7 +34,7 @@ export function deduplicateByTestIdTarget(
export async function aggregateRunDir(
runDir: string,
- options?: { evalFile?: string; experiment?: string },
+ options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
const content = await readFile(indexPath, 'utf8');
@@ -45,7 +45,18 @@ export async function aggregateRunDir(
const timingPath = path.join(runDir, 'timing.json');
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+ // Preserve `planned_test_count` from any pre-existing benchmark.json (e.g.
+ // the stub written at run start, or from the original run when this is a
+ // resume) unless an explicit value was passed.
+ const plannedTestCount =
+ options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json')));
+
+ const benchmark = buildBenchmarkArtifact(
+ results,
+ options?.evalFile,
+ options?.experiment,
+ plannedTestCount,
+ );
const benchmarkPath = path.join(runDir, 'benchmark.json');
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
@@ -53,6 +64,17 @@ export async function aggregateRunDir(
return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size };
}
+async function readPlannedTestCount(benchmarkPath: string): Promise<number | undefined> {
+ try {
+ const raw = await readFile(benchmarkPath, 'utf8');
+ const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
+ const value = parsed.metadata?.planned_test_count;
+ return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
+ } catch {
+ return undefined;
+ }
+}
+
// ---------------------------------------------------------------------------
// Artifact interfaces (snake_case to match skill-creator conventions)
// ---------------------------------------------------------------------------
@@ -110,6 +132,13 @@ export interface BenchmarkArtifact {
readonly targets: readonly string[];
readonly tests_run: readonly string[];
readonly experiment?: string;
+ /**
+ * Total number of test cases the run was planned to execute (across all
+ * targets and eval files). Written at run start so an interrupted run
+ * can be diagnosed as resumable when `tests_run.length < planned_test_count`,
+ * even if every recorded row has `execution_status: ok`.
+ */
+ readonly planned_test_count?: number;
};
readonly run_summary: Record<
string,
@@ -364,6 +393,7 @@ export function buildBenchmarkArtifact(
results: readonly EvaluationResult[],
evalFile = '',
experiment?: string,
+ plannedTestCount?: number,
): BenchmarkArtifact {
const targetSet = new Set<string>();
const testIdSet = new Set<string>();
@@ -457,6 +487,7 @@ export function buildBenchmarkArtifact(
targets,
tests_run: testIds,
experiment,
+ planned_test_count: plannedTestCount,
},
run_summary: runSummary,
per_grader_summary: perEvaluatorSummary,
@@ -464,6 +495,35 @@ export function buildBenchmarkArtifact(
};
}
+/**
+ * Write a stub `benchmark.json` at the start of a run, before any tests
+ * have executed. Carries `planned_test_count` so an interrupted run can
+ * still be detected as resumable even when every recorded row has
+ * `execution_status: ok`.
+ *
+ * The end-of-run write (writeArtifactsFromResults / aggregateRunDir)
+ * overwrites this file with the full summary; preserve `planned_test_count`
+ * by passing it through.
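+ *
+ * Illustrative stub metadata (values hypothetical):
+ *   "metadata": { "eval_file": "evals/core.yaml", "targets": [],
+ *                 "tests_run": [], "planned_test_count": 12 }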
+ */
+export async function writeInitialBenchmarkArtifact(
+ runDir: string,
+ options: {
+ evalFile: string;
+ plannedTestCount: number;
+ experiment?: string;
+ },
+): Promise<void> {
+ await mkdir(runDir, { recursive: true });
+ const stub = buildBenchmarkArtifact(
+ [],
+ options.evalFile,
+ options.experiment,
+ options.plannedTestCount,
+ );
+ const benchmarkPath = path.join(runDir, 'benchmark.json');
+ await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
+}
+
export function buildAggregateGradingArtifact(
results: readonly EvaluationResult[],
): AggregateGradingArtifact {
@@ -826,7 +886,7 @@ export async function writePerTestArtifacts(
export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
- options?: { evalFile?: string; experiment?: string },
+ options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{
testArtifactDir: string;
timingPath: string;
@@ -877,8 +937,16 @@ export async function writeArtifactsFromResults(
const timing = buildTimingArtifact(results);
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
- // Write benchmark
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+ // Write benchmark — preserve `planned_test_count` from the run-start stub
+ // (or from the original run when this is a resume) unless an explicit
+ // value was passed by the caller.
+ const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath));
+ const benchmark = buildBenchmarkArtifact(
+ results,
+ options?.evalFile,
+ options?.experiment,
+ plannedTestCount,
+ );
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
await writeJsonlFile(indexPath, indexRecords);
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index db0b0fb19..0f3536061 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -38,6 +38,7 @@ import {
deduplicateByTestIdTarget,
parseJsonlResults,
writeArtifactsFromResults,
+ writeInitialBenchmarkArtifact,
} from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
@@ -1447,6 +1448,21 @@ export async function runEvalCommand(
);
}
+ // Write a stub benchmark.json before dispatching tests, carrying the planned
+ // execution count so an interrupted run can still surface as resumable in
+ // Studio (results.length < planned_test_count) even when every recorded row
+ // has execution_status: ok. The end-of-run write preserves this value via
+ // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
+ // Skip on resume — we want to preserve the *original* planned count.
+ if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
+ const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
+ await writeInitialBenchmarkArtifact(runDir, {
+ evalFile,
+ plannedTestCount: totalEvalCount,
+ experiment: normalizeExperimentName(options.experiment),
+ });
+ }
+
// Eval files run sequentially; within each file, --workers N test cases run in parallel.
// This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
// workspace races without any grouping complexity.
diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
index e40c01c74..ab601e7f2 100644
--- a/apps/cli/src/commands/results/eval-runner.ts
+++ b/apps/cli/src/commands/results/eval-runner.ts
@@ -412,6 +412,35 @@ export function registerEvalRoutes(
}
});
+ // ── Stop a running eval ────────────────────────────────────────────────
+ // POST (not DELETE) because Stop is part of the stop → resume → complete
+ // workflow, not a destructive cancel. The run remains resumable from the
+ // partial index.jsonl on disk. Idempotent: hitting /stop on a terminal
+ // run returns 200 with `stopped: false, reason: 'already_terminal'`
+ // rather than 4xx, so clients can fire-and-forget.
+ //
+ // SIGTERM the spawned CLI; the existing child.on('close') flips status
+ // to 'finished'/'failed'. The CLI's own signal handler walks its tracked
+ // grandchildren (claude/codex/pi/copilot subprocesses) and kills them
+ // before exiting.
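+ //
+ // Response shapes, as returned by the handler below:
+ //   live run:  { "stopped": true, "status": "running" }
+ //   terminal:  { "stopped": false, "reason": "already_terminal", "status": "finished" }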
+ app.post('/api/eval/run/:id/stop', (c) => {
+ if (readOnly) {
+ return c.json({ error: 'Studio is running in read-only mode' }, 403);
+ }
+ const id = c.req.param('id');
+ const run = activeRuns.get(id ?? '');
+ if (!run) return c.json({ error: 'Run not found' }, 404);
+ if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+ return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+ }
+ try {
+ run.process.kill('SIGTERM');
+ } catch (err) {
+ return c.json({ error: (err as Error).message }, 500);
+ }
+ return c.json({ stopped: true, status: run.status });
+ });
+
// ── Run status ─────────────────────────────────────────────────────────
app.get('/api/eval/status/:id', (c) => {
const id = c.req.param('id');
@@ -576,6 +605,24 @@ export function registerEvalRoutes(
}
});
+ app.post('/api/benchmarks/:benchmarkId/eval/run/:id/stop', (c) => {
+ if (readOnly) {
+ return c.json({ error: 'Studio is running in read-only mode' }, 403);
+ }
+ const id = c.req.param('id');
+ const run = activeRuns.get(id ?? '');
+ if (!run) return c.json({ error: 'Run not found' }, 404);
+ if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+ return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+ }
+ try {
+ run.process.kill('SIGTERM');
+ } catch (err) {
+ return c.json({ error: (err as Error).message }, 500);
+ }
+ return c.json({ stopped: true, status: run.status });
+ });
+
app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => {
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index f88bf212e..8468bdf88 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -347,8 +347,8 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
function deriveResumeMeta(
cwd: string,
manifestPath: string,
-): { run_dir?: string; suite_filter?: string } {
- const out: { run_dir?: string; suite_filter?: string } = {};
+): { run_dir?: string; suite_filter?: string; planned_test_count?: number } {
+ const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {};
const runDir = path.dirname(manifestPath);
const relative = path.relative(cwd, runDir);
// path.relative returns '..'-prefixed paths when runDir is outside cwd; keep
@@ -359,15 +359,19 @@ function deriveResumeMeta(
const benchmarkPath = path.join(runDir, 'benchmark.json');
if (existsSync(benchmarkPath)) {
const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
- metadata?: { eval_file?: string };
+ metadata?: { eval_file?: string; planned_test_count?: number };
};
const evalFile = parsed.metadata?.eval_file;
if (typeof evalFile === 'string' && evalFile.trim()) {
out.suite_filter = evalFile.trim();
}
+ const planned = parsed.metadata?.planned_test_count;
+ if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) {
+ out.planned_test_count = planned;
+ }
}
} catch {
- // benchmark.json missing / unreadable / malformed — leave suite_filter unset.
+ // benchmark.json missing / unreadable / malformed — leave fields unset.
}
return out;
}
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 0e69e495a..30c04ee66 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -1003,6 +1003,52 @@ describe('serve app', () => {
});
});
+ // ── POST /api/eval/run/:id/stop — interrupt a running eval ─────────────
+ //
+ // Stop is part of the stop → resume workflow, not a destructive cancel —
+ // POST (not DELETE) and idempotent on already-terminal runs. These tests
+ // validate routing/auth shape (404 unknown id, 403 read-only). The happy
+ // path SIGTERM behavior is covered by manual UAT because it requires a
+ // live subprocess that is reliably mid-run; unit tests that race a launch
+ // against a stop are flaky.
+
+ describe('POST /api/eval/run/:id/stop (stop API)', () => {
+ function makeAppForStop(opts?: { readOnly?: boolean }) {
+ return createApp([], tempDir, undefined, undefined, {
+ studioDir,
+ readOnly: opts?.readOnly === true,
+ });
+ }
+
+ it('returns 404 for an unknown run id', async () => {
+ const app = makeAppForStop();
+ const res = await app.request('/api/eval/run/no-such-id/stop', { method: 'POST' });
+ expect(res.status).toBe(404);
+ });
+
+ it('returns 403 in read-only mode', async () => {
+ const app = makeAppForStop({ readOnly: true });
+ const res = await app.request('/api/eval/run/anything/stop', { method: 'POST' });
+ expect(res.status).toBe(403);
+ });
+
+ it('returns 404 for benchmark-scoped stop with unknown run id', async () => {
+ const app = makeAppForStop();
+ const res = await app.request('/api/benchmarks/some-id/eval/run/no-such-id/stop', {
+ method: 'POST',
+ });
+ expect(res.status).toBe(404);
+ });
+
+ it('returns 403 in read-only mode for benchmark-scoped stop', async () => {
+ const app = makeAppForStop({ readOnly: true });
+ const res = await app.request('/api/benchmarks/some-id/eval/run/anything/stop', {
+ method: 'POST',
+ });
+ expect(res.status).toBe(403);
+ });
+ });
+
// ── POST /api/eval/preview — argument shaping for resume flags ─────────
//
// /api/eval/preview is a lightweight endpoint that returns the CLI
diff --git a/apps/studio/src/components/ResumeRunActions.tsx b/apps/studio/src/components/ResumeRunActions.tsx
index d161c9d22..f50ebe5b5 100644
--- a/apps/studio/src/components/ResumeRunActions.tsx
+++ b/apps/studio/src/components/ResumeRunActions.tsx
@@ -35,6 +35,7 @@ export interface ResumeRunActionsProps {
target?: string;
benchmarkId?: string;
isReadOnly: boolean;
+ plannedTestCount?: number;
}
export function ResumeRunActions({
@@ -44,12 +45,13 @@ export function ResumeRunActions({
target,
benchmarkId,
isReadOnly,
+ plannedTestCount,
}: ResumeRunActionsProps) {
const navigate = useNavigate();
const [busy, setBusy] = useState<string | null>(null);
const [error, setError] = useState<string | null>(null);
- if (!shouldShowResumeActions(results, isReadOnly)) return null;
+ if (!shouldShowResumeActions(results, isReadOnly, plannedTestCount)) return null;
// Both actions need the run dir + the original eval file. Without those
// we can't target the existing run workspace, so we render the buttons
diff --git a/apps/studio/src/components/StopRunButton.tsx b/apps/studio/src/components/StopRunButton.tsx
new file mode 100644
index 000000000..d0eb65e2b
--- /dev/null
+++ b/apps/studio/src/components/StopRunButton.tsx
@@ -0,0 +1,62 @@
+/**
+ * StopRunButton — pause-style affordance on /jobs/:runId that interrupts
+ * a Studio-launched eval. Stop is part of the stop → resume → complete
+ * workflow, not a destructive cancel: the partial index.jsonl is
+ * preserved and can be resumed in one click from the run-detail page.
+ *
+ * Calls POST /api/eval/run/:id/stop (or the benchmark-scoped variant).
+ * Optimistically flips the local label to "Stopping…" until the next
+ * poll of /api/eval/status/:id observes a terminal state — at which
+ * point the button hides via `shouldShowStopButton`.
+ *
+ * Styling is intentionally neutral (gray, not red) to signal that this
+ * is a pause, not a kill.
+ */
+
+import { useState } from 'react';
+
+import { stopEvalRun } from '~/lib/api';
+
+import { type RunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+export interface StopRunButtonProps {
+ runId: string;
+ status: RunStatus | undefined;
+ isReadOnly: boolean;
+ benchmarkId?: string;
+}
+
+export function StopRunButton({ runId, status, isReadOnly, benchmarkId }: StopRunButtonProps) {
+ const [stopping, setStopping] = useState(false);
+ const [error, setError] = useState<string | null>(null);
+
+ if (!shouldShowStopButton(status, isReadOnly)) return null;
+
+ async function onClick() {
+ setStopping(true);
+ setError(null);
+ try {
+ await stopEvalRun(runId, benchmarkId);
+ } catch (err) {
+ setError(err instanceof Error ? err.message : 'Failed to stop run');
+ setStopping(false);
+ }
+ // On success, leave `stopping=true`. The status poller will flip to
+ // a terminal state shortly, at which point the button unmounts.
+ }
+
+ return (
+ // Neutral (gray, not red) styling per the header comment; exact classes omitted.
+ <div>
+ <button type="button" onClick={onClick} disabled={stopping}>
+ {stopping ? 'Stopping…' : 'Stop'}
+ </button>
+ {error && <p>{error}</p>}
+ </div>
+ );
+}
diff --git a/apps/studio/src/components/resume-run-helpers.test.ts b/apps/studio/src/components/resume-run-helpers.test.ts
index 39e2d807c..9ceb71870 100644
--- a/apps/studio/src/components/resume-run-helpers.test.ts
+++ b/apps/studio/src/components/resume-run-helpers.test.ts
@@ -32,6 +32,23 @@ describe('shouldShowResumeActions', () => {
it('hides on empty results', () => {
expect(shouldShowResumeActions([], false)).toBe(false);
});
+
+ it('shows for an incomplete partial run with only ok rows when planned_test_count exceeds results', () => {
+ // Stop button / Ctrl+C scenario: 5 of 10 planned tests finished
+ // successfully before the run was killed. No execution errors, but
+ // still resumable.
+ const results = [ok('a'), ok('b'), ok('c'), ok('d'), ok('e')];
+ expect(shouldShowResumeActions(results, false, 10)).toBe(true);
+ });
+
+ it('hides when results match planned_test_count (complete passing run)', () => {
+ const results = [ok('a'), ok('b'), ok('c')];
+ expect(shouldShowResumeActions(results, false, 3)).toBe(false);
+ });
+
+ it('hides incomplete partial run in read-only mode', () => {
+ expect(shouldShowResumeActions([ok('a')], true, 5)).toBe(false);
+ });
});
describe('buildResumeRequestBody', () => {
diff --git a/apps/studio/src/components/resume-run-helpers.ts b/apps/studio/src/components/resume-run-helpers.ts
index fcbf805a0..b8e6451f5 100644
--- a/apps/studio/src/components/resume-run-helpers.ts
+++ b/apps/studio/src/components/resume-run-helpers.ts
@@ -21,14 +21,29 @@ export interface BuildResumeRequestParams {
}
/**
- * Whether the resume actions should be visible. The button only makes sense
- * when at least one row failed with an execution error and the user has
- * write access (read-only mode hides the entire control rather than
- * showing a disabled button — see issue acceptance criteria).
+ * Whether the resume actions should be visible. The button is shown when:
+ * 1. At least one recorded row has `execution_status: execution_error`, OR
+ * 2. The run is *incomplete* — fewer recorded rows than the originally
+ * planned execution count, even if every recorded row is `ok`.
+ *
+ * Case 2 covers Stop-button / Ctrl+C interruptions where the run produced
+ * only successful rows before being killed: there is no `execution_error`
+ * to anchor on, but the run is still resumable. `plannedTestCount` is
+ * persisted in `benchmark.json.metadata` at run start (see
+ * `writeInitialBenchmarkArtifact`).
+ *
+ * Hidden in read-only mode — the server also returns 403, but UI-level
+ * hiding avoids dead controls.
*/
-export function shouldShowResumeActions(results: EvalResult[], isReadOnly: boolean): boolean {
+export function shouldShowResumeActions(
+ results: EvalResult[],
+ isReadOnly: boolean,
+ plannedTestCount?: number,
+): boolean {
if (isReadOnly) return false;
- return results.some((r) => r.executionStatus === 'execution_error');
+ if (results.some((r) => r.executionStatus === 'execution_error')) return true;
+ if (plannedTestCount !== undefined && results.length < plannedTestCount) return true;
+ return false;
}
/**
diff --git a/apps/studio/src/components/stop-run-helpers.test.ts b/apps/studio/src/components/stop-run-helpers.test.ts
new file mode 100644
index 000000000..3bd407b1b
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from 'bun:test';
+
+import { isTerminalRunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+describe('isTerminalRunStatus', () => {
+ it('treats finished and failed as terminal', () => {
+ expect(isTerminalRunStatus('finished')).toBe(true);
+ expect(isTerminalRunStatus('failed')).toBe(true);
+ });
+
+ it('treats live states and unknowns as non-terminal', () => {
+ expect(isTerminalRunStatus('starting')).toBe(false);
+ expect(isTerminalRunStatus('running')).toBe(false);
+ expect(isTerminalRunStatus(undefined)).toBe(false);
+ });
+});
+
+describe('shouldShowStopButton', () => {
+ it('shows while the run is live', () => {
+ expect(shouldShowStopButton('starting', false)).toBe(true);
+ expect(shouldShowStopButton('running', false)).toBe(true);
+ });
+
+ it('hides once the run reaches a terminal state', () => {
+ expect(shouldShowStopButton('finished', false)).toBe(false);
+ expect(shouldShowStopButton('failed', false)).toBe(false);
+ });
+
+ it('hides in read-only mode regardless of status', () => {
+ expect(shouldShowStopButton('running', true)).toBe(false);
+ });
+
+ it('hides when the status is undefined', () => {
+ expect(shouldShowStopButton(undefined, false)).toBe(false);
+ });
+});
diff --git a/apps/studio/src/components/stop-run-helpers.ts b/apps/studio/src/components/stop-run-helpers.ts
new file mode 100644
index 000000000..fcef0e56a
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.ts
@@ -0,0 +1,28 @@
+/**
+ * Pure helpers backing StopRunButton, isolated for unit testing.
+ *
+ * Intentionally side-effect-free so the visibility matrix is testable
+ * without rendering React.
+ *
+ * To extend: widen the RunStatus union as the server adds lifecycle
+ * states. Today the server emits only starting / running / finished /
+ * failed; anything outside the terminal set is treated as live.
+ */
+
+export type RunStatus = 'starting' | 'running' | 'finished' | 'failed' | (string & {});
+
+export function isTerminalRunStatus(status: RunStatus | undefined): boolean {
+ return status === 'finished' || status === 'failed';
+}
+
+/**
+ * Whether the Stop button should be visible. Hidden when the run is
+ * terminal (no process to kill) and in read-only mode (the API also
+ * 403s, but UI-level hiding avoids dead controls).
+ */
+export function shouldShowStopButton(status: RunStatus | undefined, isReadOnly: boolean): boolean {
+ if (isReadOnly) return false;
+ if (!status) return false;
+ return !isTerminalRunStatus(status);
+}
diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts
index 1ea719e6e..eb4b95010 100644
--- a/apps/studio/src/lib/api.ts
+++ b/apps/studio/src/lib/api.ts
@@ -539,6 +539,21 @@ export async function launchEvalRun(
return res.json() as Promise;
}
+export async function stopEvalRun(
+ runId: string,
+ benchmarkId?: string,
+): Promise<{ stopped: boolean; reason?: string; status?: string }> {
+ const url = benchmarkId
+ ? `${benchmarkApiBase(benchmarkId)}/eval/run/${runId}/stop`
+ : `/api/eval/run/${runId}/stop`;
+ const res = await fetch(url, { method: 'POST' });
+ if (!res.ok) {
+ const err = await res.json().catch(() => ({ error: res.statusText }));
+ throw new Error((err as { error?: string }).error ?? `Failed: ${res.status}`);
+ }
+ return res.json() as Promise<{ stopped: boolean; reason?: string; status?: string }>;
+}
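+
+// Usage sketch (mirrors StopRunButton): fire the POST, then let the status
+// poller observe the terminal state; no optimistic cache update is needed.
+//   await stopEvalRun(runId, benchmarkId); // { stopped: true, status: 'running' }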
+
export function evalRunStatusOptions(runId: string | null) {
return queryOptions({
queryKey: ['eval-status', runId],
diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts
index 46f321d32..6ef4dc1b4 100644
--- a/apps/studio/src/lib/types.ts
+++ b/apps/studio/src/lib/types.ts
@@ -80,6 +80,8 @@ export interface RunDetailResponse {
run_dir?: string;
/** Eval file path the run was launched against, if recorded in benchmark.json. Local runs only. */
suite_filter?: string;
+ /** Total (test_id, target) executions originally planned for this run; used to detect incomplete partial runs as resumable. Local runs only; absent for runs launched before the planned-count metadata shipped. */
+ planned_test_count?: number;
}
export interface SuiteSummary {
diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
index 169d6d643..2bf2b9ea6 100644
--- a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
+++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
@@ -64,7 +64,9 @@ function BenchmarkEvalDetailPage() {
Run: {runId} / Eval: {evalId}