diff --git a/.agents/conventions.md b/.agents/conventions.md
index ca4da8dbf..6e1d0deae 100644
--- a/.agents/conventions.md
+++ b/.agents/conventions.md
@@ -29,6 +29,21 @@ When spawning a subprocess with an explicit `cwd`, pass user-supplied `args` thr
 - Those heuristics miss bare relative paths such as `plugins/foo`, can corrupt flag-value pairs such as `--config=./x`, and duplicate behavior the subprocess already handles.
 - See `docs/solutions/best-practices/trust-subprocess-cwd-for-relative-path-resolution.md`.
 
+## Git Remote Ownership
+
+Treat an existing Git checkout's remote configuration as user-owned state.
+AgentV may read remotes, fetch from a configured remote name, and push results
+refs to that remote, but it must not run `git remote add` or `git remote
+set-url` in an existing checkout as a side effect of Dashboard status, results
+sync, eval publishing, or WIP checkpoint handling. This applies especially to
+`results.repo.path: .`, where the source checkout's existing `origin` is the
+authoritative remote.
+
+If AgentV needs a separate results checkout and the configured path is missing
+or empty, create it with `git clone` and the requested remote name. If the path
+already exists, use its current Git config as-is or fail with clear setup
+guidance; do not repair, rewrite, or synthesize remotes in place.
+
 ## Naming: Project vs Benchmark
 
 These terms are distinct and not interchangeable.
@@ -36,7 +51,7 @@ These terms are distinct and not interchangeable.
 - Project: the top-level container Dashboard organizes around, backed by a registered workspace directory with `.agentv/`, run artifacts, traces, and experiments. The registry lives in `~/.agentv/projects.yaml` and is modeled by `ProjectEntry` and `ProjectRegistry` in `packages/core/src/projects.ts`.
 - Benchmark: a curated eval suite designed to measure something specific, in the academic ML sense. Example directories using this meaning are correctly named and should not be renamed.
 
-The legacy `~/.agentv/benchmarks.yaml` file is auto-migrated to `projects.yaml` by `migrateLegacyBenchmarksFile()`. The unrelated per-run `benchmark.json` artifact is a third, separate concept and should keep that name.
+The legacy `~/.agentv/benchmarks.yaml` file is auto-migrated to `projects.yaml` by `migrateLegacyBenchmarksFile()`. Run-level results metadata lives in `summary.json`, with `index.jsonl` as the discovery anchor.
 
 Rule of thumb:
 
diff --git a/STRATEGY.md b/STRATEGY.md
index a021b0101..5f52388d9 100644
--- a/STRATEGY.md
+++ b/STRATEGY.md
@@ -21,7 +21,7 @@ AgentV stays repo-native and workspace-native: it runs or imports evaluations ar
 
 - **Repo-native eval success** - Share of dogfood and example eval flows that run against real workspaces, hooks, repo materialization, or imported artifacts without extra infrastructure; measured by CI and manual UAT on canonical suites.
 - **Time to inspect a run** - Time from completed `agentv eval` to usable local review, compare, or report output from the canonical run bundle; measured through CLI and Dashboard/report workflows.
-- **Artifact portability coverage** - Share of integrations and follow-on workflows that consume `index.jsonl`, `benchmark.json`, trace sidecars, or imported run bundles instead of bespoke stores; measured by adapter smoke tests, docs, and example coverage.
+- **Artifact portability coverage** - Share of integrations and follow-on workflows that consume `index.jsonl`, `summary.json`, trace sidecars, or imported run bundles instead of bespoke stores; measured by adapter smoke tests, docs, and example coverage.
 - **Git-backed results reliability** - Success rate for publish, sync, resume, and WIP checkpoint flows across local branches and dedicated results repos; measured by integration tests and manual end-to-end verification.
 
 ## Tracks
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 09e2d098e..e1591675b 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -3,7 +3,6 @@ import path from 'node:path';
 import {
   type AdditionalResultArtifactsWriter,
   type AggregateGradingArtifact,
-  type BenchmarkArtifact,
   type EvalTest,
   type EvaluationResult,
   type ExperimentArtifactMetadata,
@@ -11,14 +10,16 @@ import {
   type GradingArtifact,
   type IndexArtifactEntry,
   RESULT_INDEX_FILENAME,
+  RUN_SUMMARY_FILENAME,
   type ResultIndexArtifact,
+  type RunSummaryArtifact,
   type TimingArtifact,
   aggregateRunDir,
   buildAggregateGradingArtifact,
-  buildBenchmarkArtifact,
   buildIndexArtifactEntry as buildCoreIndexArtifactEntry,
   buildResultIndexArtifact as buildCoreResultIndexArtifact,
   buildGradingArtifact,
+  buildRunSummaryArtifact,
   buildTestTargetKey,
   buildTimingArtifact,
   deduplicateByTestIdTarget,
@@ -26,7 +27,7 @@ import {
   writeArtifacts,
   writeArtifactsFromResults as writeCoreArtifactsFromResults,
   writePerTestArtifacts as writeCorePerTestArtifacts,
-  writeInitialBenchmarkArtifact,
+  writeInitialRunSummaryArtifact,
 } from '@agentv/core';
 import type { TargetDefinition } from '@agentv/core';
 
@@ -39,22 +40,23 @@ import {
 export {
   aggregateRunDir,
   buildAggregateGradingArtifact,
-  buildBenchmarkArtifact,
+  buildRunSummaryArtifact,
   buildGradingArtifact,
   buildTestTargetKey,
   buildTimingArtifact,
   deduplicateByTestIdTarget,
   parseJsonlResults,
   RESULT_INDEX_FILENAME,
+  RUN_SUMMARY_FILENAME,
   writeArtifacts,
-  writeInitialBenchmarkArtifact,
+  writeInitialRunSummaryArtifact,
 };
 export type {
   AggregateGradingArtifact,
-  BenchmarkArtifact,
   GradingArtifact,
   IndexArtifactEntry,
   ResultIndexArtifact,
+  RunSummaryArtifact,
   TimingArtifact,
 };
 
@@ -90,15 +92,15 @@ export function buildIndexArtifactEntry(
   options: {
     outputDir: string;
     artifactDir?: string;
-    gradingPath: string;
-    timingPath: string;
+    gradingPath?: string;
+    timingPath?: string;
+    summaryPath?: string;
     outputPath?: string;
     answerPath?: string;
     tracePath?: string;
     transcriptPath?: string;
     metricsPath?: string;
     rawProviderLogPath?: string;
-    inputPath?: string;
     responsePath?: string;
     taskBundle?: MaterializedTaskBundlePaths;
   },
@@ -240,8 +242,7 @@ export async function writeArtifactsFromResults(
   },
 ): Promise<{
   testArtifactDir: string;
-  timingPath: string;
-  benchmarkPath: string;
+  summaryPath: string;
   indexPath: string;
 }> {
   return writeCoreArtifactsFromResults(results, outputDir, {
diff --git a/apps/cli/src/commands/eval/commands/aggregate.ts b/apps/cli/src/commands/eval/commands/aggregate.ts
index 7483b8412..275e792ab 100644
--- a/apps/cli/src/commands/eval/commands/aggregate.ts
+++ b/apps/cli/src/commands/eval/commands/aggregate.ts
@@ -6,7 +6,7 @@ import { aggregateRunDir } from '../artifact-writer.js';
 export const evalAggregateCommand = command({
   name: 'aggregate',
   description:
-    'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.',
+    'Recompute summary.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.',
   args: {
     runDir: positional({
       type: string,
@@ -16,9 +16,8 @@ export const evalAggregateCommand = command({
   },
   handler: async (args) => {
     const runDir = path.resolve(args.runDir);
-    const { benchmarkPath, timingPath, testCount, targetCount } = await aggregateRunDir(runDir);
+    const { summaryPath, testCount, targetCount } = await aggregateRunDir(runDir);
     console.log(`Aggregated ${testCount} test result(s) across ${targetCount} target(s)`);
-    console.log(`  Benchmark: ${benchmarkPath}`);
-    console.log(`  Timing:    ${timingPath}`);
+    console.log(`  Summary: ${summaryPath}`);
   },
 });
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index a498f4ccc..3b392ce7c 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -52,7 +52,7 @@ export const evalRunCommand = command({
       long: 'output',
       short: 'o',
       description:
-        'Run artifact directory (writes index.jsonl, benchmark.json, timing, and per-test artifacts)',
+        'Run artifact directory (writes index.jsonl, summary.json, and per-case artifacts)',
     }),
     outputFormat: option({
       type: optional(string),
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 5e2eb1058..014c0ab77 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -60,7 +60,7 @@ import {
   deduplicateByTestIdTarget,
   parseJsonlResults,
   writeArtifactsFromResults,
-  writeInitialBenchmarkArtifact,
+  writeInitialRunSummaryArtifact,
 } from './artifact-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { resolveOtelBackend } from './otel-backends.js';
@@ -1996,7 +1996,7 @@ export async function runEvalCommand(
     );
   }
 
-  // Write a stub benchmark.json before dispatching tests, carrying the planned
+  // Write a stub summary.json before dispatching tests, carrying the planned
   // execution count so an interrupted run can still surface as resumable in
   // Dashboard (results.length < planned_test_count) even when every recorded row
   // has execution_status: ok. The end-of-run write preserves this value via
@@ -2004,7 +2004,7 @@ export async function runEvalCommand(
   // Skip on resume — we want to preserve the *original* planned count.
   if (!isResumeAppend && totalEvalCount > 0) {
     const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
-    await writeInitialBenchmarkArtifact(runDir, {
+    await writeInitialRunSummaryArtifact(runDir, {
       evalFile,
       plannedTestCount: totalEvalCount,
       experiment: normalizeExperimentName(options.experiment),
@@ -2262,42 +2262,36 @@ export async function runEvalCommand(
           sourceTests,
           taskBundleTargets,
         });
-        const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir(
+        const { summaryPath } = await aggregateRunDir(runDir, {
+          evalFile,
+          experiment: normalizeExperimentName(options.experiment),
+          experimentMetadata: options.experimentMetadata,
+        });
+        const indexPath = path.join(runDir, 'index.jsonl');
+        console.log(`Artifact workspace updated: ${runDir}`);
+        console.log(`  Index: ${indexPath}`);
+        console.log(`  Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
+        console.log(`  Summary: ${summaryPath}`);
+      } else {
+        const { testArtifactDir, summaryPath, indexPath } = await writeArtifactsFromResults(
+          allResults,
           runDir,
           {
             evalFile,
             experiment: normalizeExperimentName(options.experiment),
             experimentMetadata: options.experimentMetadata,
+            cwd,
+            repoRoot,
+            sourceTests,
+            taskBundleTargets,
           },
         );
-        const indexPath = path.join(runDir, 'index.jsonl');
-        console.log(`Artifact workspace updated: ${runDir}`);
-        console.log(`  Index: ${indexPath}`);
-        console.log(`  Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
-        console.log(`  Timing: ${timingPath}`);
-        console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
-      } else {
-        const {
-          testArtifactDir,
-          timingPath,
-          benchmarkPath: workspaceBenchmarkPath,
-          indexPath,
-        } = await writeArtifactsFromResults(allResults, runDir, {
-          evalFile,
-          experiment: normalizeExperimentName(options.experiment),
-          experimentMetadata: options.experimentMetadata,
-          cwd,
-          repoRoot,
-          sourceTests,
-          taskBundleTargets,
-        });
         console.log(`Artifact workspace written to: ${runDir}`);
         console.log(`  Index: ${indexPath}`);
         console.log(
           `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
         );
-        console.log(`  Timing: ${timingPath}`);
-        console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
+        console.log(`  Summary: ${summaryPath}`);
       }
     }
 
diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts
index 0df6e0c53..fa6101b5f 100644
--- a/apps/cli/src/commands/inspect/utils.ts
+++ b/apps/cli/src/commands/inspect/utils.ts
@@ -577,10 +577,10 @@ function buildRunId(relativeRunPath: string): string {
 
 function readRunDisplayName(runDir: string): string | undefined {
   try {
-    const benchmark = JSON.parse(readFileSync(path.join(runDir, 'benchmark.json'), 'utf8')) as {
+    const summary = JSON.parse(readFileSync(path.join(runDir, 'summary.json'), 'utf8')) as {
       metadata?: { display_name?: unknown };
     };
-    const displayName = benchmark.metadata?.display_name;
+    const displayName = summary.metadata?.display_name;
     return typeof displayName === 'string' && displayName.trim() ? displayName.trim() : undefined;
   } catch {
     return undefined;
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
index f7466f0ed..fed6dce88 100644
--- a/apps/cli/src/commands/pipeline/bench.ts
+++ b/apps/cli/src/commands/pipeline/bench.ts
@@ -7,7 +7,7 @@
  * Writes:
  *   - <test-id>/grading.json  (per-test grading breakdown)
  *   - index.jsonl             (one line per test)
- *   - benchmark.json          (aggregate statistics)
+ *   - summary.json            (aggregate statistics)
  */
 import { existsSync } from 'node:fs';
 import { readFile, readdir, writeFile } from 'node:fs/promises';
@@ -199,9 +199,9 @@ export const evalBenchCommand = command({
       'utf8',
     );
 
-    // Write benchmark.json
+    // Write summary.json
     const passRateStats = computeStats(allPassRates);
-    const benchmark = {
+    const summary = {
       metadata: {
         eval_file: manifest.eval_file,
         timestamp: manifest.timestamp,
@@ -216,11 +216,24 @@ export const evalBenchCommand = command({
           tokens: { mean: 0, stddev: 0 },
         },
       },
+      timing: {
+        total_tokens: 0,
+        duration_ms: 0,
+        total_duration_seconds: 0,
+        cost_usd: null,
+        token_usage: { input: 0, output: 0, reasoning: 0 },
+        usage_sources: {
+          token_usage: 'unavailable',
+          total_tokens: 'unavailable',
+          duration: 'unavailable',
+          cost: 'unavailable',
+        },
+      },
       notes: [],
     };
     await writeFile(
-      join(exportDir, 'benchmark.json'),
-      `${JSON.stringify(benchmark, null, 2)}\n`,
+      join(exportDir, 'summary.json'),
+      `${JSON.stringify(summary, null, 2)}\n`,
       'utf8',
     );
 
diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts
index 18d7bbb99..113fa2461 100644
--- a/apps/cli/src/commands/results/combine-run.ts
+++ b/apps/cli/src/commands/results/combine-run.ts
@@ -5,8 +5,8 @@
  * Combines two or more local run workspace manifests into a new local run
  * workspace. The writer keeps per-test artifacts self-contained by copying
  * referenced source files under `sources/source-N/` and rewriting manifest
- * paths, while recomputing top-level `timing.json` and `benchmark.json` from
- * the selected result rows.
+ * paths, while recomputing top-level `summary.json` from the selected result
+ * rows.
  */
 
 import {
@@ -28,10 +28,9 @@ import type {
 } from '@agentv/core';
 
 import {
-  type BenchmarkArtifact,
-  buildBenchmarkArtifact,
+  type RunSummaryArtifact,
+  buildRunSummaryArtifact,
   buildTestTargetKey,
-  buildTimingArtifact,
 } from '../eval/artifact-writer.js';
 import {
   buildDefaultRunDirFromName,
@@ -103,8 +102,7 @@ export interface CombineRunResult {
   readonly runDir: string;
   readonly runId: string;
   readonly manifestPath: string;
-  readonly benchmarkPath: string;
-  readonly timingPath: string;
+  readonly summaryPath: string;
   readonly displayName: string;
   readonly experiment: string;
   readonly combinedFromRunIds: readonly string[];
@@ -126,13 +124,13 @@ function readManifestRecords(manifestPath: string): ResultManifestRecord[] {
     .map(parseJsonlLine);
 }
 
-function readBenchmarkMetadata(manifestPath: string): {
+function readSummaryMetadata(manifestPath: string): {
   timestamp?: string;
   displayName?: string;
 } {
   try {
-    const benchmarkPath = path.join(path.dirname(manifestPath), 'benchmark.json');
-    const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
+    const summaryPath = path.join(path.dirname(manifestPath), 'summary.json');
+    const parsed = JSON.parse(readFileSync(summaryPath, 'utf8')) as {
       metadata?: { timestamp?: string; display_name?: string };
     };
     return {
@@ -168,7 +166,7 @@ function loadSources(sources: readonly CombineRunSource[]): LoadedSource[] {
     if (records.length !== results.length) {
       throw new Error(`Manifest could not be hydrated completely: ${manifestPath}`);
     }
-    const metadata = readBenchmarkMetadata(manifestPath);
+    const metadata = readSummaryMetadata(manifestPath);
     return {
       ...source,
       index,
@@ -363,7 +361,6 @@ function resolveCombinedExperiment(
 
 const MANIFEST_PATH_FIELDS = [
   'artifact_dir',
-  'benchmark_path',
   'summary_path',
   'grading_path',
   'timing_path',
@@ -604,13 +601,9 @@ export function combineRunSources(options: CombineRunOptions): CombineRunResult
   const manifestPath = path.join(runDir, 'index.jsonl');
   writeJsonl(manifestPath, records);
 
-  const timing = buildTimingArtifact(results);
-  const timingPath = path.join(runDir, 'timing.json');
-  writeJson(timingPath, timing);
-
-  const benchmark = buildBenchmarkArtifact(results, '', 'combined', results.length);
-  const benchmarkWithMetadata: BenchmarkArtifact & {
-    metadata: BenchmarkArtifact['metadata'] & {
+  const summary = buildRunSummaryArtifact(results, '', 'combined', results.length);
+  const summaryWithMetadata: RunSummaryArtifact & {
+    metadata: RunSummaryArtifact['metadata'] & {
       display_name: string;
       combined_from_run_ids: readonly string[];
       combined_from_display_names: readonly string[];
@@ -618,10 +611,10 @@ export function combineRunSources(options: CombineRunOptions): CombineRunResult
       duplicate_policy: Exclude<CombineDuplicatePolicy, 'prompt'> | 'prompt';
     };
   } = {
-    ...benchmark,
+    ...summary,
     metadata: {
-      ...benchmark.metadata,
-      timestamp: startedAt ?? benchmark.metadata.timestamp,
+      ...summary.metadata,
+      timestamp: startedAt ?? summary.metadata.timestamp,
       display_name: displayName,
       experiment,
       combined_from_run_ids: loadedSources.map((source) => source.id),
@@ -629,16 +622,15 @@ export function combineRunSources(options: CombineRunOptions): CombineRunResult
       duplicate_policy: options.duplicatePolicy,
     },
   };
-  const benchmarkPath = path.join(runDir, 'benchmark.json');
-  writeJson(benchmarkPath, benchmarkWithMetadata);
+  const summaryPath = path.join(runDir, 'summary.json');
+  writeJson(summaryPath, summaryWithMetadata);
 
   const tags = [...new Set(loadedSources.flatMap((source) => source.tags ?? []))].sort();
   return {
     runDir,
     runId: toRunId(options.cwd, runDir),
     manifestPath,
-    benchmarkPath,
-    timingPath,
+    summaryPath,
     displayName,
     experiment,
     combinedFromRunIds: loadedSources.map((source) => source.id),
diff --git a/apps/cli/src/commands/results/combine.ts b/apps/cli/src/commands/results/combine.ts
index 2f96c4e38..4566df69f 100644
--- a/apps/cli/src/commands/results/combine.ts
+++ b/apps/cli/src/commands/results/combine.ts
@@ -114,7 +114,7 @@ export const resultsCombineCommand = command({
     displayName: option({
       type: optional(string),
       long: 'display-name',
-      description: 'Display name stored in benchmark.json metadata',
+      description: 'Display name stored in summary.json metadata',
     }),
     duplicatePolicy: option({
       type: optional(oneOf(['prompt', 'error', 'latest'])),
@@ -175,8 +175,7 @@ export const resultsCombineCommand = command({
       });
       console.log(`Combined ${result.testCount} result row(s) into ${result.runDir}`);
       console.log(`  Run ID: ${result.runId}`);
-      console.log(`  Benchmark: ${result.benchmarkPath}`);
-      console.log(`  Timing:    ${result.timingPath}`);
+      console.log(`  Summary: ${result.summaryPath}`);
       if (result.duplicateConflicts.length > 0) {
         console.log(`  Duplicates handled: ${result.duplicateConflicts.length}`);
       }
diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
index bfd4e51df..83dbec6f0 100644
--- a/apps/cli/src/commands/results/export.ts
+++ b/apps/cli/src/commands/results/export.ts
@@ -4,16 +4,16 @@
  *
  * Output structure:
  *   <output-dir>/
- *     benchmark.json           — aggregate scores, pass/fail counts, timing
+ *     summary.json             — run aggregate scores, metadata, and timing
  *     index.jsonl              — per-test manifest with artifact pointers
  *     <test-id>/
- *       grading.json           — per-test grading artifact (assertions, graders)
- *       timing.json            — per-test timing artifact
- *       outputs/answer.md              — human-readable agent response for this test
- *       task/PROMPT.md               — human-readable input messages for this test
+ *       summary.json           — per-case aggregate
+ *       run-1/result.json      — per-run result
+ *       run-1/grading.json     — per-run grading artifact (assertions, graders)
+ *       run-1/metrics.json     — per-run metrics artifact
  *
  * This module delegates artifact building to the shared artifact-writer so
- * that benchmark/grading/timing schemas stay aligned with `agentv eval`.
+ * that summary/grading/timing schemas stay aligned with `agentv eval`.
  *
  * How to extend:
  *   - To change artifact schemas, update artifact-writer.ts (single source of truth).
diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
index 679cff1ef..0372b79e8 100644
--- a/apps/cli/src/commands/results/manifest.ts
+++ b/apps/cli/src/commands/results/manifest.ts
@@ -29,6 +29,14 @@ export interface ResultManifestRecord {
   readonly target?: string;
   readonly score: number;
   readonly scores?: readonly Record<string, unknown>[];
+  readonly trials?: readonly {
+    readonly attempt?: number;
+    readonly run_path?: string;
+    readonly score?: number;
+    readonly verdict?: string;
+    readonly [key: string]: unknown;
+  }[];
+  readonly aggregation?: Record<string, unknown>;
   readonly execution_status?: string;
   readonly error?: string;
   readonly cost_usd?: number;
@@ -39,7 +47,6 @@ export interface ResultManifestRecord {
     readonly reasoning?: number;
   };
   readonly trace?: Record<string, unknown>;
-  readonly benchmark_path?: string;
   readonly summary_path?: string;
   readonly grading_path?: string;
   readonly timing_path?: string;
diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts
index e6b69c6d5..042259775 100644
--- a/apps/cli/src/commands/results/projection-bundle.ts
+++ b/apps/cli/src/commands/results/projection-bundle.ts
@@ -87,7 +87,6 @@ export type ProjectionBundleArtifactRefs = Partial<
   Pick<
     IndexArtifactEntry,
     | 'artifact_dir'
-    | 'benchmark_path'
     | 'summary_path'
     | 'grading_path'
     | 'timing_path'
@@ -173,7 +172,6 @@ function artifactRefs(
   return dropUndefined({
     ...metadataRefs,
     artifact_dir: indexEntry.artifact_dir,
-    benchmark_path: indexEntry.benchmark_path,
     summary_path: indexEntry.summary_path,
     grading_path: indexEntry.grading_path,
     input_path: indexEntry.input_path,
diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts
index 109b584d3..113523dfd 100644
--- a/apps/cli/src/commands/results/report.ts
+++ b/apps/cli/src/commands/results/report.ts
@@ -13,7 +13,7 @@ interface ReportManifestRecord {
   readonly eval_file?: string;
 }
 
-interface BenchmarkMetadata {
+interface RunSummaryMetadata {
   readonly metadata?: {
     readonly eval_file?: string;
   };
@@ -33,15 +33,15 @@ function normalizeEvalFileLabel(value: string | undefined): string | undefined {
     .replace(/\.jsonl$/i, '');
 }
 
-function readBenchmarkEvalFile(sourceFile: string): string | undefined {
-  const benchmarkPath = path.join(path.dirname(sourceFile), 'benchmark.json');
-  if (!existsSync(benchmarkPath)) {
+function readSummaryEvalFile(sourceFile: string): string | undefined {
+  const summaryPath = path.join(path.dirname(sourceFile), 'summary.json');
+  if (!existsSync(summaryPath)) {
     return undefined;
   }
 
   try {
-    const benchmark = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as BenchmarkMetadata;
-    return normalizeEvalFileLabel(benchmark.metadata?.eval_file);
+    const summary = JSON.parse(readFileSync(summaryPath, 'utf8')) as RunSummaryMetadata;
+    return normalizeEvalFileLabel(summary.metadata?.eval_file);
   } catch {
     return undefined;
   }
@@ -55,11 +55,11 @@ function serializeReportResult(
   result: EvaluationResult,
   sourceFile: string,
   manifestRecord?: ReportManifestRecord,
-  benchmarkEvalFile?: string,
+  summaryEvalFile?: string,
 ): Record<string, unknown> {
   const fallbackEvalFile =
     normalizeEvalFileLabel(manifestRecord?.eval_file) ??
-    benchmarkEvalFile ??
+    summaryEvalFile ??
     normalizeEvalFileLabel(result.suite) ??
     path.basename(path.dirname(sourceFile));
 
@@ -90,7 +90,7 @@ export async function loadReportSource(
   sourceFile: string;
   results: EvaluationResult[];
   records: readonly ReportManifestRecord[];
-  benchmarkEvalFile?: string;
+  summaryEvalFile?: string;
 }> {
   const { sourceFile } = await resolveSourceFile(source, cwd);
   const resolvedSourceFile = resolveResultSourcePath(sourceFile, cwd);
@@ -106,7 +106,7 @@ export async function loadReportSource(
     sourceFile: resolvedSourceFile,
     results,
     records,
-    benchmarkEvalFile: readBenchmarkEvalFile(resolvedSourceFile),
+    summaryEvalFile: readSummaryEvalFile(resolvedSourceFile),
   };
 }
 
@@ -114,14 +114,14 @@ export function renderResultsReport(
   results: readonly EvaluationResult[],
   sourceFile: string,
   records: readonly ReportManifestRecord[],
-  benchmarkEvalFile?: string,
+  summaryEvalFile?: string,
 ): string {
   if (!RESULTS_REPORT_TEMPLATE.includes('__DATA_PLACEHOLDER__')) {
     throw new Error('Report template is missing __DATA_PLACEHOLDER__');
   }
 
   const rows = results.map((result, index) =>
-    serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile),
+    serializeReportResult(result, sourceFile, records[index], summaryEvalFile),
   );
   const dataJson = JSON.stringify(rows).replace(/<\//g, '<\\/');
   return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', () => dataJson);
@@ -132,13 +132,13 @@ export async function writeResultsReport(
   outputPath: string | undefined,
   cwd: string,
 ): Promise<{ sourceFile: string; outputPath: string; html: string }> {
-  const { sourceFile, results, records, benchmarkEvalFile } = await loadReportSource(source, cwd);
+  const { sourceFile, results, records, summaryEvalFile } = await loadReportSource(source, cwd);
   const resolvedOutputPath = outputPath
     ? path.isAbsolute(outputPath)
       ? outputPath
       : path.resolve(cwd, outputPath)
     : deriveReportPath(sourceFile);
-  const html = renderResultsReport(results, sourceFile, records, benchmarkEvalFile);
+  const html = renderResultsReport(results, sourceFile, records, summaryEvalFile);
 
   mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
   writeFileSync(resolvedOutputPath, html, 'utf8');
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 7685f7660..d6f649fce 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -640,6 +640,36 @@ function normalizeArtifactRelativePath(relativePath: string): string | undefined
   return segments.join('/');
 }
 
+/**
+ * Reads the optional `artifact_dir` query parameter from the request.
+ * Returns `{}` when it is absent or blank, `{ value }` with the normalized
+ * path when valid, and `{ error }` when normalization rejects it.
+ */
+function requestedArtifactDir(c: C): { value?: string; error?: string } {
+  const rawValue = c.req.query('artifact_dir')?.trim();
+  if (!rawValue) return {};
+  const cleaned = normalizeArtifactRelativePath(rawValue);
+  return cleaned ? { value: cleaned } : { error: 'Invalid artifact_dir' };
+}
+
+/**
+ * Finds the first manifest record whose `test_id` equals `evalId`; when
+ * `artifactDir` is given, the record's normalized `artifact_dir` must match too.
+ */
+function manifestRecordSelection(
+  records: readonly ResultManifestRecord[],
+  evalId: string,
+  artifactDir?: string,
+): { record: ResultManifestRecord; index: number } | undefined {
+  for (const [index, record] of records.entries()) {
+    if (record.test_id !== evalId) continue;
+    const dirMatches =
+      !artifactDir || normalizeArtifactRelativePath(record.artifact_dir ?? '') === artifactDir;
+    if (dirMatches) return { record, index };
+  }
+  return undefined;
+}
+
 function relativeRunPathFromNormalizedManifestPath(manifestPath: string): string | undefined {
   const parts = manifestPath.split('/').filter(Boolean);
   const runsIndex = parts.lastIndexOf('runs');
@@ -765,6 +795,46 @@ function displayPathFromArtifactKey(key: string | undefined, runPath: string | u
   return normalizeArtifactRelativePath(normalizedKey.slice(runPrefix.length)) ?? normalizedKey;
 }
 
+/**
+ * Offers the per-run JSON files of every trial on `record` to the catalog.
+ *
+ * For each trial with a usable `run_path`, four candidate paths under
+ * `<artifact_dir>/<run_path>/` are passed to `addDirectArtifactCatalogEntry`
+ * in a fixed order: `result.json`, `grading.json`, `metrics.json`, and then
+ * `timing.json`, each with `'artifact'` as the final argument.
+ *
+ * Nothing is offered when the record has no `artifact_dir` or that directory
+ * fails normalization; a trial is skipped when its `run_path` is missing or
+ * fails normalization.
+ *
+ * @param entries - Catalog under construction, forwarded to the entry helper.
+ * @param seen - Set forwarded to the entry helper (presumably de-duplication
+ *   state; confirm against `addDirectArtifactCatalogEntry`).
+ * @param record - Manifest record supplying `artifact_dir` and `trials`.
+ */
+function addTrialRunCatalogEntries(
+  entries: ArtifactCatalogEntry[],
+  seen: Set<string>,
+  record: ResultManifestRecord,
+): void {
+  if (!record.artifact_dir) return;
+  const caseDir = normalizeArtifactRelativePath(record.artifact_dir);
+  if (!caseDir) return;
+
+  const perRunFiles = ['result.json', 'grading.json', 'metrics.json', 'timing.json'];
+  for (const { run_path: rawRunPath } of record.trials ?? []) {
+    if (!rawRunPath) continue;
+    const trialDir = normalizeArtifactRelativePath(rawRunPath);
+    if (!trialDir) continue;
+
+    // Each trial's files are addressed beneath its own run directory.
+    const runDir = path.posix.join(caseDir, trialDir);
+    for (const fileName of perRunFiles) {
+      addDirectArtifactCatalogEntry(entries, seen, path.posix.join(runDir, fileName), 'artifact');
+    }
+  }
+}
+
 function buildResultArtifactCatalog(
   record: ResultManifestRecord,
   options?: { readonly runPath?: string },
@@ -780,7 +850,6 @@ function buildResultArtifactCatalog(
   addPointerArtifactCatalogEntry(entries, seen, trace, 'trace', options?.runPath);
   addPointerArtifactCatalogEntry(entries, seen, answer, 'answer', options?.runPath);
 
-  addDirectArtifactCatalogEntry(entries, seen, record.benchmark_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.summary_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.grading_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.timing_path, 'artifact');
@@ -792,6 +861,7 @@ function buildResultArtifactCatalog(
   addDirectArtifactCatalogEntry(entries, seen, recordWithTrace.trace_path, 'trace');
   addDirectArtifactCatalogEntry(entries, seen, record.eval_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.targets_path, 'artifact');
+  addTrialRunCatalogEntries(entries, seen, record);
 
   return entries;
 }
@@ -1028,6 +1098,118 @@ function stripHeavyFields(results: readonly EvaluationResult[]) {
   });
 }
 
+function readArtifactJsonObject(
+  baseDir: string,
+  relativePath: string | undefined,
+): Record<string, unknown> | undefined {
+  if (!relativePath) return undefined;
+  const resolved = resolveReadableRunArtifactFile(baseDir, relativePath);
+  if (!resolved.absolutePath) return undefined;
+  try {
+    const parsed = JSON.parse(readFileSync(resolved.absolutePath, 'utf8')) as unknown;
+    return parsed && typeof parsed === 'object' && !Array.isArray(parsed)
+      ? (parsed as Record<string, unknown>)
+      : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
+function numberField(record: Record<string, unknown> | undefined, key: string): number | undefined {
+  const value = record?.[key];
+  return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
+}
+
+function objectField(
+  record: Record<string, unknown> | undefined,
+  key: string,
+): Record<string, unknown> | undefined {
+  const value = record?.[key];
+  return value && typeof value === 'object' && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function caseTrialArtifactPath(
+  artifactDir: string | undefined,
+  runPath: string | undefined,
+  filePath: string,
+): string | undefined {
+  if (!artifactDir || !runPath) return undefined;
+  return path.posix.join(artifactDir, runPath, filePath);
+}
+
+function buildRepeatTrialReadModels(
+  baseDir: string,
+  record: ResultManifestRecord,
+): Array<Record<string, unknown>> | undefined {
+  if (!record.trials || record.trials.length === 0) return undefined;
+  const artifactDir = record.artifact_dir
+    ? normalizeArtifactRelativePath(record.artifact_dir)
+    : undefined;
+
+  return record.trials.map((trial) => {
+    const runPath = trial.run_path ? normalizeArtifactRelativePath(trial.run_path) : undefined;
+    const metricsPath = caseTrialArtifactPath(artifactDir, runPath, 'metrics.json');
+    const timingPath = caseTrialArtifactPath(artifactDir, runPath, 'timing.json');
+    const gradingPath = caseTrialArtifactPath(artifactDir, runPath, 'grading.json');
+    const transcriptPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript-raw.jsonl');
+    const answerPath = caseTrialArtifactPath(artifactDir, runPath, 'outputs/answer.md');
+    const metrics = readArtifactJsonObject(baseDir, metricsPath);
+    const timing = readArtifactJsonObject(baseDir, timingPath);
+    const toolCalls = objectField(metrics, 'tool_calls');
+    const tokenUsage = objectField(timing, 'token_usage');
+
+    return {
+      ...trial,
+      ...(numberField(timing, 'duration_ms') !== undefined && {
+        duration_ms: numberField(timing, 'duration_ms'),
+      }),
+      ...(numberField(timing, 'total_tokens') !== undefined && {
+        total_tokens: numberField(timing, 'total_tokens'),
+      }),
+      ...(numberField(timing, 'cost_usd') !== undefined && {
+        cost_usd: numberField(timing, 'cost_usd'),
+      }),
+      ...(tokenUsage && { token_usage: tokenUsage }),
+      ...(numberField(metrics, 'total_tool_calls') !== undefined && {
+        total_tool_calls: numberField(metrics, 'total_tool_calls'),
+      }),
+      ...(toolCalls && { tool_calls: toolCalls }),
+      ...(metricsPath && { metrics_path: metricsPath }),
+      ...(timingPath && { timing_path: timingPath }),
+      ...(gradingPath && { grading_path: gradingPath }),
+      ...(transcriptPath && { transcript_path: transcriptPath }),
+      ...(answerPath && { answer_path: answerPath }),
+    };
+  });
+}
+
+function attachRunDetailReadModelFields<T extends Record<string, unknown>>(
+  results: readonly T[],
+  records: readonly ResultManifestRecord[],
+  baseDir: string,
+): T[] {
+  return results.map((result, index) => {
+    const record = records[index];
+    if (!record) return result;
+    const trials = buildRepeatTrialReadModels(baseDir, record);
+    return {
+      ...result,
+      ...(record.aggregation && { aggregation: record.aggregation }),
+      ...(record.artifact_dir && { artifact_dir: record.artifact_dir }),
+      ...(record.summary_path && { summary_path: record.summary_path }),
+      ...(record.grading_path && { grading_path: record.grading_path }),
+      ...(record.timing_path && { timing_path: record.timing_path }),
+      ...(record.metrics_path && { metrics_path: record.metrics_path }),
+      ...(record.transcript_path && { transcript_path: record.transcript_path }),
+      ...(record.output_path && { output_path: record.output_path }),
+      ...(record.answer_path && { answer_path: record.answer_path }),
+      ...(trials && { trials }),
+    };
+  });
+}
+
 // ── Shared data-route handlers ───────────────────────────────────────────
 //
 // Each handler takes a Hono Context and a DataContext (resolved directories).
@@ -1460,8 +1642,12 @@ async function handleRunDetail(c: C, { searchDir, projectId }: DataContext) {
     const resumeMeta = meta.source === 'local' ? deriveResumeMeta(searchDir, meta.path) : {};
     const liveStatus = meta.source === 'local' ? getActiveRunStatus(meta.path) : undefined;
     const tagFields = await readRunTagFields(searchDir, meta, projectId);
+    const baseDir = path.dirname(meta.path);
     return c.json({
-      results: attachExternalTraceFields(stripHeavyFields(loaded), records),
+      results: attachExternalTraceFields(
+        attachRunDetailReadModelFields(stripHeavyFields(loaded), records, baseDir),
+        records,
+      ),
       source: meta.source,
       source_label: meta.displayName,
       ...tagFields,
@@ -1509,7 +1695,7 @@ function attachExternalTraceFields<T extends Record<string, unknown>>(
 
 /**
  * Compute `run_dir` (relative to cwd, snake_case) and `suite_filter` (the
- * eval file path stored in benchmark.json metadata) for a local run manifest.
+ * eval file path stored in summary.json metadata) for a local run manifest.
  * Returns whatever fields could be resolved — both are best-effort and only
  * needed by the Dashboard "Resume run" / "Rerun failed" actions.
  */
@@ -1525,9 +1711,9 @@ function deriveResumeMeta(
   // dir as cwd) is unusual but valid — fall through to absolute in that case.
   out.run_dir = relative !== '' && !relative.startsWith('..') ? relative : runDir;
   try {
-    const benchmarkPath = path.join(runDir, 'benchmark.json');
-    if (existsSync(benchmarkPath)) {
-      const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
+    const summaryPath = path.join(runDir, 'summary.json');
+    if (existsSync(summaryPath)) {
+      const parsed = JSON.parse(readFileSync(summaryPath, 'utf8')) as {
         metadata?: { eval_file?: string; planned_test_count?: number };
       };
       const evalFile = parsed.metadata?.eval_file;
@@ -1540,7 +1726,7 @@ function deriveResumeMeta(
       }
     }
   } catch {
-    // benchmark.json missing / unreadable / malformed — leave fields unset.
+    // summary.json missing / unreadable / malformed — leave fields unset.
   }
   return out;
 }
@@ -1647,14 +1833,24 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D
 
 async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) {
   const filename = c.req.param('filename') ?? '';
-  const evalId = c.req.param('evalId');
+  const evalId = c.req.param('evalId') ?? '';
+  if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
+  const artifactDir = requestedArtifactDir(c);
+  if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
   const meta = await findRunById(searchDir, filename, projectId);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
     const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId);
-    const result = loaded.find((r) => r.testId === evalId);
-    if (!result) return c.json({ error: 'Eval not found' }, 404);
-    const [stripped] = stripHeavyFields([result]);
+    const records = await parseManifestForMeta(searchDir, meta, projectId);
+    const selection = manifestRecordSelection(records, evalId, artifactDir.value);
+    const result = selection ? loaded[selection.index] : undefined;
+    if (!selection || !result) return c.json({ error: 'Eval not found' }, 404);
+    const baseDir = path.dirname(meta.path);
+    const [stripped] = attachRunDetailReadModelFields(
+      stripHeavyFields([result]),
+      [selection.record],
+      baseDir,
+    );
     return c.json({ eval: stripped });
   } catch {
     return c.json({ error: 'Failed to load eval' }, 500);
@@ -1663,13 +1859,17 @@ async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) {
 
 async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) {
   const filename = c.req.param('filename') ?? '';
-  const evalId = c.req.param('evalId');
+  const evalId = c.req.param('evalId') ?? '';
+  if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
+  const artifactDir = requestedArtifactDir(c);
+  if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
   const meta = await findRunById(searchDir, filename, projectId);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
     const records = await parseManifestForMeta(searchDir, meta, projectId);
-    const record = records.find((r) => r.test_id === evalId);
-    if (!record) return c.json({ error: 'Eval not found' }, 404);
+    const selection = manifestRecordSelection(records, evalId, artifactDir.value);
+    if (!selection) return c.json({ error: 'Eval not found' }, 404);
+    const { record } = selection;
 
     const baseDir = path.dirname(meta.path);
     const catalog = buildResultArtifactCatalog(record, {
@@ -1686,7 +1886,10 @@ async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) {
 
 async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext) {
   const filename = c.req.param('filename') ?? '';
-  const evalId = c.req.param('evalId');
+  const evalId = c.req.param('evalId') ?? '';
+  if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
+  const artifactDir = requestedArtifactDir(c);
+  if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
   const meta = await findRunById(searchDir, filename, projectId);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
 
@@ -1705,8 +1908,9 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext
 
   await ensureRunReadable(searchDir, meta, projectId);
   const records = parseResultManifest(readFileSync(meta.path, 'utf8'));
-  const record = records.find((r) => r.test_id === evalId);
-  if (!record) return c.json({ error: 'Eval not found' }, 404);
+  const selection = manifestRecordSelection(records, evalId, artifactDir.value);
+  if (!selection) return c.json({ error: 'Eval not found' }, 404);
+  const { record } = selection;
   const catalog = buildResultArtifactCatalog(record, {
     runPath: relativeRunPathFromManifestPath(meta.path),
   });
@@ -1729,14 +1933,18 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext
 
 async function handleEvalTraceSession(c: C, { searchDir, projectId }: DataContext) {
   const filename = c.req.param('filename') ?? '';
-  const evalId = c.req.param('evalId');
+  const evalId = c.req.param('evalId') ?? '';
+  if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
+  const artifactDir = requestedArtifactDir(c);
+  if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
   const meta = await findRunById(searchDir, filename, projectId);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
 
   try {
     const records = await parseManifestForMeta(searchDir, meta, projectId);
-    const record = records.find((r) => r.test_id === evalId);
-    if (!record) return c.json({ error: 'Eval not found' }, 404);
+    const selection = manifestRecordSelection(records, evalId, artifactDir.value);
+    if (!selection) return c.json({ error: 'Eval not found' }, 404);
+    const { record } = selection;
 
     const trace = resolveRecordArtifactPointer(record, 'trace');
     const runPath = relativeRunPathFromManifestPath(meta.path);
@@ -1857,14 +2065,18 @@ async function handleEvalTraceSession(c: C, { searchDir, projectId }: DataContex
 
 async function handleEvalTranscript(c: C, { searchDir, projectId }: DataContext) {
   const filename = c.req.param('filename') ?? '';
-  const evalId = c.req.param('evalId');
+  const evalId = c.req.param('evalId') ?? '';
+  if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
+  const artifactDir = requestedArtifactDir(c);
+  if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
   const meta = await findRunById(searchDir, filename, projectId);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
 
   try {
     const records = await parseManifestForMeta(searchDir, meta, projectId);
-    const record = records.find((r) => r.test_id === evalId);
-    if (!record) return c.json({ error: 'Eval not found' }, 404);
+    const selection = manifestRecordSelection(records, evalId, artifactDir.value);
+    if (!selection) return c.json({ error: 'Eval not found' }, 404);
+    const { record } = selection;
 
     const transcript = resolveRecordArtifactPointer(record, 'transcript');
     const answer = resolveRecordArtifactPointer(record, 'answer');
diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts
index 680f5b0b4..cbd2f8679 100644
--- a/apps/cli/src/commands/results/validate.ts
+++ b/apps/cli/src/commands/results/validate.ts
@@ -5,9 +5,9 @@
  * Checks:
  *   1. Directory follows the `.agentv/results/<experiment>/<timestamp>` naming convention
  *   2. index.jsonl exists and each line has required fields
- *   3. Per-test grading.json exists for every entry in the index
- *   4. Per-test timing.json exists for direct case rows (warning if missing)
- *   5. benchmark.json exists (warning if missing)
+ *   3. Per-case summary.json exists for every entry in the index
+ *   4. Per-run result.json and grading.json exist for every trial with an artifact_dir and run_path
+ *   5. Run-level summary.json exists at the root of the run directory
  *   6. Scores are within [0, 1]
  *   7. index.jsonl entries have `scores[]` array (warning if missing — dashboard needs it)
  *
@@ -34,10 +34,11 @@ interface IndexEntry {
   readonly target?: string;
   readonly scores?: unknown[];
   readonly execution_status?: string;
-  readonly benchmark_path?: string;
   readonly summary_path?: string;
   readonly grading_path?: string;
   readonly timing_path?: string;
+  readonly artifact_dir?: string;
+  readonly trials?: readonly { readonly run_path?: string }[];
   readonly [key: string]: unknown;
 }
 
@@ -141,10 +142,10 @@ function checkIndexJsonl(runDir: string): { diagnostics: Diagnostic[]; entries:
         });
       }
 
-      if (!entry.grading_path && !entry.benchmark_path) {
+      if (!entry.summary_path) {
         diagnostics.push({
-          severity: 'warning',
-          message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'grading_path' or 'benchmark_path'`,
+          severity: 'error',
+          message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'summary_path'`,
         });
       }
 
@@ -215,12 +216,23 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[]
       }
     }
 
-    if (entry.benchmark_path) {
-      const benchmarkPath = path.join(runDir, entry.benchmark_path);
-      if (!existsSync(benchmarkPath)) {
+    for (const trial of entry.trials ?? []) {
+      if (!entry.artifact_dir || !trial.run_path) {
+        continue;
+      }
+      const runDirPath = path.join(runDir, entry.artifact_dir, trial.run_path);
+      const resultPath = path.join(runDirPath, 'result.json');
+      const gradingPath = path.join(runDirPath, 'grading.json');
+      if (!existsSync(resultPath)) {
+        diagnostics.push({
+          severity: 'error',
+          message: `${testId}: result.json not found at '${path.posix.join(entry.artifact_dir, trial.run_path, 'result.json')}'`,
+        });
+      }
+      if (!existsSync(gradingPath)) {
         diagnostics.push({
           severity: 'error',
-          message: `${testId}: benchmark.json not found at '${entry.benchmark_path}'`,
+          message: `${testId}: grading.json not found at '${path.posix.join(entry.artifact_dir, trial.run_path, 'grading.json')}'`,
         });
       }
     }
@@ -269,10 +281,10 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[]
     }
   }
 
-  // Check benchmark.json
-  const benchmarkPath = path.join(runDir, 'benchmark.json');
-  if (!existsSync(benchmarkPath)) {
-    diagnostics.push({ severity: 'warning', message: 'benchmark.json is missing' });
+  // Check run summary.json
+  const summaryPath = path.join(runDir, 'summary.json');
+  if (!existsSync(summaryPath)) {
+    diagnostics.push({ severity: 'error', message: 'summary.json is missing' });
   }
 
   return diagnostics;
diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts
index 40b00d9a1..9ef6034ca 100644
--- a/apps/cli/test/commands/eval/aggregate.test.ts
+++ b/apps/cli/test/commands/eval/aggregate.test.ts
@@ -115,7 +115,7 @@ describe('aggregateRunDir', () => {
     rmSync(tmpDir, { recursive: true, force: true });
   });
 
-  it('reads index.jsonl, deduplicates, writes benchmark.json and timing.json', async () => {
+  it('reads index.jsonl, deduplicates, and writes summary.json with timing rollups', async () => {
     writeJsonlIndex(tmpDir, [
       { testId: 'a', target: 'x', score: 0.1, executionStatus: 'execution_error' },
       { testId: 'a', target: 'x', score: 0.9, executionStatus: 'ok' },
@@ -126,13 +126,11 @@ describe('aggregateRunDir', () => {
     expect(result.testCount).toBe(2);
     expect(result.targetCount).toBe(1);
 
-    const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8'));
-    expect(benchmark.metadata.tests_run).toContain('a');
-    expect(benchmark.metadata.tests_run).toContain('b');
-    expect(benchmark.run_summary.x).toBeDefined();
-
-    const timing = JSON.parse(readFileSync(result.timingPath, 'utf8'));
-    expect(timing.total_tokens).toBeGreaterThanOrEqual(0);
+    const summary = JSON.parse(readFileSync(result.summaryPath, 'utf8'));
+    expect(summary.metadata.tests_run).toContain('a');
+    expect(summary.metadata.tests_run).toContain('b');
+    expect(summary.run_summary.x).toBeDefined();
+    expect(summary.timing.total_tokens).toBeGreaterThanOrEqual(0);
   });
 
   it('uses last entry for duplicates in benchmark stats', async () => {
@@ -144,7 +142,7 @@ describe('aggregateRunDir', () => {
     const result = await aggregateRunDir(tmpDir);
     expect(result.testCount).toBe(1);
 
-    const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8'));
+    const benchmark = JSON.parse(readFileSync(result.summaryPath, 'utf8'));
     // Should have 100% pass rate since the last entry is ok with score 1.0
     expect(benchmark.run_summary.x.pass_rate.mean).toBe(1);
   });
@@ -181,13 +179,19 @@ describe('writePerTestArtifacts', () => {
 
     await writePerTestArtifacts(results, tmpDir);
 
-    const grading1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'grading.json'), 'utf8'));
+    const grading1 = JSON.parse(
+      readFileSync(path.join(tmpDir, 'test-1', 'run-1', 'grading.json'), 'utf8'),
+    );
     expect(grading1.assertions).toHaveLength(1);
 
-    const timing1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'timing.json'), 'utf8'));
+    const timing1 = JSON.parse(
+      readFileSync(path.join(tmpDir, 'test-1', 'run-1', 'timing.json'), 'utf8'),
+    );
     expect(timing1.total_tokens).toBeGreaterThanOrEqual(0);
 
-    const grading2 = JSON.parse(readFileSync(path.join(tmpDir, 'test-2', 'grading.json'), 'utf8'));
+    const grading2 = JSON.parse(
+      readFileSync(path.join(tmpDir, 'test-2', 'run-1', 'grading.json'), 'utf8'),
+    );
     expect(grading2.assertions).toHaveLength(1);
   });
 
@@ -196,7 +200,10 @@ describe('writePerTestArtifacts', () => {
 
     await writePerTestArtifacts(results, tmpDir);
 
-    const answer = readFileSync(path.join(tmpDir, 'test-1', 'outputs', 'answer.md'), 'utf8');
+    const answer = readFileSync(
+      path.join(tmpDir, 'test-1', 'run-1', 'outputs', 'answer.md'),
+      'utf8',
+    );
     expect(answer).toContain('hello');
   });
 });
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 3e2feac68..bf30802d4 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -26,14 +26,14 @@ import {
 
 import {
   type AggregateGradingArtifact,
-  type BenchmarkArtifact,
   type GradingArtifact,
   type IndexArtifactEntry,
+  type RunSummaryArtifact,
   type TimingArtifact,
   buildAggregateGradingArtifact,
-  buildBenchmarkArtifact,
   buildGradingArtifact,
   buildIndexArtifactEntry,
+  buildRunSummaryArtifact,
   buildTimingArtifact,
   parseJsonlResults,
   writeArtifacts,
@@ -336,7 +336,7 @@ describe('buildTimingArtifact', () => {
 // Benchmark artifact
 // ---------------------------------------------------------------------------
 
-describe('buildBenchmarkArtifact', () => {
+describe('buildRunSummaryArtifact', () => {
   it('computes per-target statistics', () => {
     const results = [
       makeResult({ target: 'gpt-4', score: 0.9, durationMs: 30000 }),
@@ -344,7 +344,7 @@ describe('buildBenchmarkArtifact', () => {
       makeResult({ target: 'claude', score: 0.5, durationMs: 45000 }),
     ];
 
-    const benchmark = buildBenchmarkArtifact(results, 'test.eval.yaml');
+    const benchmark = buildRunSummaryArtifact(results, 'test.eval.yaml');
 
     expect(benchmark.metadata.eval_file).toBe('test.eval.yaml');
     expect(benchmark.metadata.targets).toEqual(['claude', 'gpt-4']);
@@ -371,7 +371,7 @@ describe('buildBenchmarkArtifact', () => {
       }),
     ];
 
-    const benchmark = buildBenchmarkArtifact(results);
+    const benchmark = buildRunSummaryArtifact(results);
 
     expect(benchmark.per_grader_summary).toBeDefined();
     expect(benchmark.per_grader_summary?.['quality:llm-grader'].mean).toBe(0.8);
@@ -380,7 +380,7 @@ describe('buildBenchmarkArtifact', () => {
   it('adds note when execution errors present', () => {
     const results = [makeResult({ executionStatus: 'execution_error', score: 0 })];
 
-    const benchmark = buildBenchmarkArtifact(results);
+    const benchmark = buildRunSummaryArtifact(results);
     expect(benchmark.notes).toContain(
       '1 test(s) had execution errors and are excluded from quality pass_rate',
     );
@@ -401,14 +401,14 @@ describe('buildBenchmarkArtifact', () => {
       }),
     ];
 
-    const benchmark = buildBenchmarkArtifact(results);
+    const benchmark = buildRunSummaryArtifact(results);
 
     expect(benchmark.run_summary['test-target'].pass_rate.mean).toBe(1);
     expect(benchmark.per_grader_summary?.['quality:llm-grader'].mean).toBe(1);
   });
 
   it('handles empty results', () => {
-    const benchmark = buildBenchmarkArtifact([]);
+    const benchmark = buildRunSummaryArtifact([]);
 
     expect(benchmark.metadata.targets).toEqual([]);
     expect(benchmark.metadata.tests_run).toEqual([]);
@@ -418,7 +418,7 @@ describe('buildBenchmarkArtifact', () => {
   it('includes cost_usd when available', () => {
     const results = [makeResult({ costUsd: 0.05 }), makeResult({ testId: 'test-2', costUsd: 0.1 })];
 
-    const benchmark = buildBenchmarkArtifact(results);
+    const benchmark = buildRunSummaryArtifact(results);
     const summary = benchmark.run_summary['test-target'];
     expect(summary.cost_usd).toBeDefined();
     expect(summary.cost_usd?.mean).toBe(0.075);
@@ -576,11 +576,10 @@ describe('buildIndexArtifactEntry', () => {
       }),
       {
         outputDir: '/tmp/artifacts',
-        gradingPath: '/tmp/artifacts/alpha/grading.json',
-        timingPath: '/tmp/artifacts/alpha/timing.json',
+        gradingPath: '/tmp/artifacts/alpha/run-1/grading.json',
+        timingPath: '/tmp/artifacts/alpha/run-1/timing.json',
         outputPath: '/tmp/artifacts/alpha/outputs/answer.md',
         answerPath: '/tmp/artifacts/alpha/outputs/answer.md',
-        inputPath: '/tmp/artifacts/alpha/task/PROMPT.md',
       },
     );
 
@@ -608,11 +607,32 @@ describe('buildIndexArtifactEntry', () => {
       ],
       execution_status: 'quality_failure',
       error: 'model drift',
-      grading_path: 'alpha/grading.json',
-      timing_path: 'alpha/timing.json',
+      grading_path: 'alpha/run-1/grading.json',
+      timing_path: 'alpha/run-1/timing.json',
       output_path: 'alpha/outputs/answer.md',
       answer_path: 'alpha/outputs/answer.md',
-      input_path: 'alpha/task/PROMPT.md',
+      trials: [
+        {
+          attempt: 0,
+          run_path: 'run-1',
+          score: 0.9,
+          verdict: 'pass',
+          scores: [
+            {
+              name: 'quality',
+              type: 'llm-grader',
+              score: 0.7,
+              assertions: [
+                { text: 'criterion-a', passed: true },
+                { text: 'criterion-b', passed: false },
+              ],
+            },
+          ],
+          error: 'model drift',
+          cost_usd: 0.25,
+          execution_status: 'quality_failure',
+        },
+      ],
     });
   });
 
@@ -633,8 +653,8 @@ describe('buildIndexArtifactEntry', () => {
       }),
       {
         outputDir: '/tmp/artifacts',
-        gradingPath: '/tmp/artifacts/alpha/grading.json',
-        timingPath: '/tmp/artifacts/alpha/timing.json',
+        gradingPath: '/tmp/artifacts/alpha/run-1/grading.json',
+        timingPath: '/tmp/artifacts/alpha/run-1/timing.json',
       },
     );
 
@@ -698,9 +718,9 @@ describe('parseJsonlResults', () => {
       artifactPointers: {
         transcript: {
           ref: 'agentv/artifacts/v1',
-          key: 'transcripts/pointer-row/transcript.jsonl',
+          key: 'transcripts/pointer-row/run-1/transcript-raw.jsonl',
           object_version: 'sha256:test',
-          path: 'pointer-row/transcript.jsonl',
+          path: 'pointer-row/run-1/transcript-raw.jsonl',
           sha256: 'test',
           size: 1,
           schema_version: 'agentv.transcript.v1',
@@ -719,7 +739,7 @@ describe('parseJsonlResults', () => {
       target: 'codex',
       score: 1,
       output: 'done',
-      raw_provider_log_path: 'raw-log-case/provider.log',
+      raw_provider_log_path: 'raw-log-case/run-1/provider.log',
     })}\n`;
 
     const results = parseJsonlResults(content);
@@ -815,7 +835,7 @@ describe('schema compatibility', () => {
   });
 
   it('benchmark run_summary has pass_rate/time_seconds/tokens with mean/stddev', () => {
-    const benchmark = buildBenchmarkArtifact([makeResult({})]);
+    const benchmark = buildRunSummaryArtifact([makeResult({})]);
     const summary = benchmark.run_summary['test-target'];
 
     expect(summary).toBeDefined();
@@ -843,7 +863,7 @@ describe('writeArtifactsFromResults', () => {
     await rm(testDir, { recursive: true, force: true }).catch(() => undefined);
   });
 
-  it('writes grading, timing, and benchmark files', async () => {
+  it('writes summary, index, and per-run artifact files', async () => {
     const results = [
       makeResult({ testId: 'alpha', score: 0.9, durationMs: 5000 }),
       makeResult({ testId: 'beta', score: 0.6, durationMs: 8000 }),
@@ -855,54 +875,47 @@ describe('writeArtifactsFromResults', () => {
 
     // Check per-test artifact directories
     const artifactEntries = await readdir(paths.testArtifactDir);
-    expect(artifactEntries.sort()).toEqual([
-      'alpha',
-      'benchmark.json',
-      'beta',
-      'index.jsonl',
-      'timing.json',
-    ]);
+    expect(artifactEntries.sort()).toEqual(['alpha', 'beta', 'index.jsonl', 'summary.json']);
 
     const alphaEntries = await readdir(path.join(paths.testArtifactDir, 'alpha'));
-    expect(alphaEntries.sort()).toEqual([
+    expect(alphaEntries.sort()).toEqual(['run-1', 'summary.json']);
+
+    const alphaRunEntries = await readdir(path.join(paths.testArtifactDir, 'alpha', 'run-1'));
+    expect(alphaRunEntries.sort()).toEqual([
       'grading.json',
       'metrics.json',
       'outputs',
+      'result.json',
       'timing.json',
-      'trace.json',
-      'transcript.jsonl',
+      'transcript-raw.jsonl',
+      'transcript.json',
     ]);
 
     const alphaGrading: GradingArtifact = JSON.parse(
-      await readFile(path.join(paths.testArtifactDir, 'alpha', 'grading.json'), 'utf8'),
+      await readFile(path.join(paths.testArtifactDir, 'alpha', 'run-1', 'grading.json'), 'utf8'),
     );
     expect(alphaGrading.summary).toBeDefined();
     expect(alphaGrading).not.toHaveProperty('execution_metrics');
 
     const alphaTiming: TimingArtifact = JSON.parse(
-      await readFile(path.join(paths.testArtifactDir, 'alpha', 'timing.json'), 'utf8'),
+      await readFile(path.join(paths.testArtifactDir, 'alpha', 'run-1', 'timing.json'), 'utf8'),
     );
     expect(alphaTiming.duration_ms).toBe(5000);
 
-    // Check timing
-    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
-    expect(timing.duration_ms).toBe(13000);
-
-    // Check benchmark
-    const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8'));
-    expect(benchmark.metadata.eval_file).toBe('my-eval.yaml');
-    expect(benchmark.metadata.tests_run.sort()).toEqual(['alpha', 'beta']);
+    const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
+    expect(summary.metadata.eval_file).toBe('my-eval.yaml');
+    expect(summary.metadata.tests_run.sort()).toEqual(['alpha', 'beta']);
+    expect(summary.timing.duration_ms).toBe(13000);
 
     const indexLines = (await readFile(paths.indexPath, 'utf8'))
       .trim()
       .split('\n')
       .map((line) => JSON.parse(line) as IndexArtifactEntry);
     expect(indexLines).toHaveLength(2);
-    expect(indexLines[0]?.grading_path).toBe('alpha/grading.json');
-    expect(indexLines[0]?.timing_path).toBe('alpha/timing.json');
-    expect(indexLines[0]?.trace_path).toBe('alpha/trace.json');
-    expect(indexLines[0]?.transcript_path).toBe('alpha/transcript.jsonl');
-    expect(indexLines[0]?.metrics_path).toBe('alpha/metrics.json');
+    expect(indexLines[0]?.summary_path).toBe('alpha/summary.json');
+    expect(indexLines[0]?.grading_path).toBe('alpha/run-1/grading.json');
+    expect(indexLines[0]?.timing_path).toBe('alpha/run-1/timing.json');
+    expect(indexLines[0]?.metrics_path).toBe('alpha/run-1/metrics.json');
   });
 
   it('writes repeat runs in Vercel-compatible case and run folders', async () => {
@@ -977,27 +990,14 @@ describe('writeArtifactsFromResults', () => {
     });
     expect(indexEntry?.artifact_dir).toBe('repeat-case');
     expect(indexEntry?.summary_path).toBe('repeat-case/summary.json');
-    expect(indexEntry?.task_dir).toBe('repeat-case/task');
-    expect(indexEntry?.input_path).toBe('repeat-case/task/PROMPT.md');
-    expect(indexEntry?.benchmark_path).toBeUndefined();
-    expect(indexEntry?.grading_path).toBe('repeat-case/grading.json');
+    expect(indexEntry?.task_dir).toBeUndefined();
+    expect(indexEntry?.input_path).toBeUndefined();
+    expect(indexEntry?.grading_path).toBeUndefined();
     expect(indexEntry?.timing_path).toBeUndefined();
     expect(indexEntry?.metrics_path).toBeUndefined();
 
     const repeatEntries = await readdir(path.join(paths.testArtifactDir, 'repeat-case'));
-    expect(repeatEntries.sort()).toEqual([
-      'grading.json',
-      'run-1',
-      'run-2',
-      'summary.json',
-      'task',
-    ]);
-
-    const prompt = await readFile(
-      path.join(paths.testArtifactDir, 'repeat-case', 'task', 'PROMPT.md'),
-      'utf8',
-    );
-    expect(prompt).toBe('@[user]:\nRepeat this task prompt.');
+    expect(repeatEntries.sort()).toEqual(['run-1', 'run-2', 'summary.json']);
 
     const caseSummary = JSON.parse(
       await readFile(path.join(paths.testArtifactDir, 'repeat-case', 'summary.json'), 'utf8'),
@@ -1031,18 +1031,18 @@ describe('writeArtifactsFromResults', () => {
     });
     expect(typeof caseSummary.fingerprint).toBe('string');
 
-    const aggregateGrading: GradingArtifact = JSON.parse(
-      await readFile(path.join(paths.testArtifactDir, 'repeat-case', 'grading.json'), 'utf8'),
-    );
-    expect(aggregateGrading.trials).toEqual(indexEntry?.trials);
-    expect(aggregateGrading.aggregation).toEqual(indexEntry?.aggregation);
+    await expect(
+      readFile(path.join(paths.testArtifactDir, 'repeat-case', 'grading.json'), 'utf8'),
+    ).rejects.toThrow();
 
     for (const runDir of ['run-1', 'run-2']) {
       const runEntries = await readdir(path.join(paths.testArtifactDir, 'repeat-case', runDir));
       expect(runEntries.sort()).toEqual([
         'grading.json',
+        'metrics.json',
         'outputs',
         'result.json',
+        'timing.json',
         'transcript-raw.jsonl',
         'transcript.json',
       ]);
@@ -1094,13 +1094,11 @@ describe('writeArtifactsFromResults', () => {
     const paths = await writeArtifactsFromResults([], testDir);
 
     const artifactEntries = await readdir(paths.testArtifactDir);
-    expect(artifactEntries.sort()).toEqual(['benchmark.json', 'index.jsonl', 'timing.json']);
+    expect(artifactEntries.sort()).toEqual(['index.jsonl', 'summary.json']);
 
-    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
-    expect(timing.total_tokens).toBe(0);
-
-    const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8'));
-    expect(benchmark.notes).toContain('No results to summarize');
+    const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
+    expect(summary.notes).toContain('No results to summarize');
+    expect(summary.timing.total_tokens).toBe(0);
     expect(await readFile(paths.indexPath, 'utf8')).toBe('');
   });
 
@@ -1122,13 +1120,13 @@ describe('writeArtifactsFromResults', () => {
     await writeArtifactsFromResults(results, testDir);
 
     const gradingOne: GradingArtifact = JSON.parse(
-      await readFile(path.join(testDir, 'test-1', 'grading.json'), 'utf8'),
+      await readFile(path.join(testDir, 'test-1', 'run-1', 'grading.json'), 'utf8'),
     );
     const gradingTwo: GradingArtifact = JSON.parse(
-      await readFile(path.join(testDir, 'test-2', 'grading.json'), 'utf8'),
+      await readFile(path.join(testDir, 'test-2', 'run-1', 'grading.json'), 'utf8'),
     );
     const timingOne: TimingArtifact = JSON.parse(
-      await readFile(path.join(testDir, 'test-1', 'timing.json'), 'utf8'),
+      await readFile(path.join(testDir, 'test-1', 'run-1', 'timing.json'), 'utf8'),
     );
 
     expect(gradingOne.summary.total).toBe(1);
@@ -1179,22 +1177,18 @@ describe('writeArtifactsFromResults', () => {
 
     await writeArtifactsFromResults(results, testDir);
 
-    const transcriptPath = path.join(testDir, 'transcript-case', 'transcript.jsonl');
+    const transcriptPath = path.join(testDir, 'transcript-case', 'run-1', 'transcript-raw.jsonl');
     const transcriptLines = (await readFile(transcriptPath, 'utf8'))
       .trim()
       .split('\n')
       .map((line) => JSON.parse(line));
 
-    const envelope = TraceEnvelopeWireSchema.parse(
-      JSON.parse(await readFile(path.join(testDir, 'transcript-case', 'trace.json'), 'utf8')),
+    const transcriptMessages = JSON.parse(
+      await readFile(path.join(testDir, 'transcript-case', 'run-1', 'transcript.json'), 'utf8'),
     );
-    const projectedEnvelope = fromTraceEnvelopeWire(envelope);
-    const projectedTranscript = traceEnvelopeToTranscriptJsonLines(projectedEnvelope, {
-      testId: 'transcript-case',
-      target: 'codex',
-    });
 
-    expect(transcriptLines).toEqual(JSON.parse(JSON.stringify(projectedTranscript)));
+    expect(Array.isArray(transcriptMessages)).toBe(true);
+    expect(transcriptMessages).toHaveLength(2);
     expect(transcriptLines).toHaveLength(2);
     expect(transcriptLines[0]).toMatchObject({
       schema_version: 'agentv.transcript.v1',
@@ -1209,9 +1203,9 @@ describe('writeArtifactsFromResults', () => {
       capture: { content: 'full', redaction_level: 'none', redacted_fields: [] },
       trace: {
         schema_version: 'agentv.trace.v1',
-        artifact_id: envelope.artifact_id,
-        trace_id: envelope.trace.trace_id,
-        span_id: envelope.trace.root_span_id,
+        artifact_id: expect.any(String),
+        trace_id: expect.any(String),
+        span_id: expect.any(String),
       },
       source: {
         kind: 'agentv_run',
@@ -1242,8 +1236,8 @@ describe('writeArtifactsFromResults', () => {
           status: 'ok',
           trace: {
             schema_version: 'agentv.trace.v1',
-            artifact_id: envelope.artifact_id,
-            trace_id: envelope.trace.trace_id,
+            artifact_id: expect.any(String),
+            trace_id: expect.any(String),
           },
         },
       ],
@@ -1257,18 +1251,6 @@ describe('writeArtifactsFromResults', () => {
     expect(transcriptLines[1].tool_calls[0].trace.span_id).toBeTruthy();
     expect(transcriptLines[1]).not.toHaveProperty('provider_session_id');
     expect(transcriptLines[1]).not.toHaveProperty('providerSessionId');
-    expect(envelope.schema_version).toBe('agentv.trace.v1');
-    expect(envelope.artifact_id).toMatch(/^execution-trace-/);
-    expect(envelope.artifacts.trace_path).toBe(CANONICAL_TRACE_ARTIFACT_PATH);
-    expect(envelope.artifacts.transcript_path).toBe(CANONICAL_TRANSCRIPT_ARTIFACT_PATH);
-    expect(envelope.artifacts.metrics_path).toBe(CANONICAL_METRICS_ARTIFACT_PATH);
-    expect(envelope.artifacts).not.toHaveProperty('execution_trace_path');
-    expect(envelope.eval.test_id).toBe('transcript-case');
-    expect(envelope.trace.spans.map((span) => span.attributes['gen_ai.operation.name'])).toEqual([
-      'invoke_agent',
-      'chat',
-      'execute_tool',
-    ]);
     await expect(
       readFile(path.join(testDir, 'transcript-case', 'transcript.json'), 'utf8'),
     ).rejects.toThrow();
@@ -1276,40 +1258,12 @@ describe('writeArtifactsFromResults', () => {
     const indexLine = JSON.parse(
       (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(),
     );
-    expect(indexLine.trace_path).toBe('transcript-case/trace.json');
-    expect(indexLine.transcript_path).toBe('transcript-case/transcript.jsonl');
-    expect(indexLine.transcript_path.endsWith(CANONICAL_TRANSCRIPT_ARTIFACT_PATH)).toBe(true);
-    expect(indexLine.metrics_path).toBe('transcript-case/metrics.json');
+    expect(indexLine).not.toHaveProperty('trace_path');
+    expect(indexLine.transcript_path).toBe('transcript-case/run-1/transcript-raw.jsonl');
+    expect(indexLine.metrics_path).toBe('transcript-case/run-1/metrics.json');
     expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true);
 
-    const traceContent = await readFile(path.join(testDir, 'transcript-case', 'trace.json'));
-    const transcriptContent = await readFile(transcriptPath);
-    const traceSha = sha256Hex(traceContent);
-    const transcriptSha = sha256Hex(transcriptContent);
-
-    expect(indexLine.artifact_pointers.trace).toMatchObject({
-      ref: AGENTV_RESULTS_ARTIFACTS_REF,
-      key: 'traces/transcript-case/trace.json',
-      object_version: `sha256:${traceSha}`,
-      path: 'transcript-case/trace.json',
-      sha256: traceSha,
-      size: traceContent.byteLength,
-      schema_version: EXECUTION_TRACE_SCHEMA_VERSION,
-      media_type: TRACE_JSON_MEDIA_TYPE,
-      family: 'traces',
-    });
-    expect(indexLine.artifact_pointers.transcript).toMatchObject({
-      ref: AGENTV_RESULTS_ARTIFACTS_REF,
-      key: 'transcripts/transcript-case/transcript.jsonl',
-      object_version: `sha256:${transcriptSha}`,
-      path: 'transcript-case/transcript.jsonl',
-      sha256: transcriptSha,
-      size: transcriptContent.byteLength,
-      schema_version: TRANSCRIPT_SCHEMA_VERSION,
-      media_type: TRANSCRIPT_JSONL_MEDIA_TYPE,
-      family: 'transcripts',
-    });
-    expect(indexLine.artifact_pointers).not.toHaveProperty('metrics');
+    expect(indexLine.artifact_pointers).toBeUndefined();
   });
 
   it('writes AgentV metrics as Agent Skills and Vercel-style behavior projections', async () => {
@@ -1406,16 +1360,18 @@ describe('writeArtifactsFromResults', () => {
     const indexLine = JSON.parse(
       (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(),
     );
-    expect(indexLine.metrics_path).toBe('summary-case/metrics.json');
+    expect(indexLine.metrics_path).toBe('summary-case/run-1/metrics.json');
 
     const summary = MetricsArtifactWireSchema.parse(
-      JSON.parse(await readFile(path.join(testDir, 'summary-case', 'metrics.json'), 'utf8')),
+      JSON.parse(
+        await readFile(path.join(testDir, 'summary-case', 'run-1', 'metrics.json'), 'utf8'),
+      ),
     );
 
     expect(summary.schema_version).toBe(METRICS_SCHEMA_VERSION);
     expect(summary.source_artifacts).toMatchObject({
-      trace_path: CANONICAL_TRACE_ARTIFACT_PATH,
-      transcript_path: CANONICAL_TRANSCRIPT_ARTIFACT_PATH,
+      trace_path: 'transcript.json',
+      transcript_path: 'transcript-raw.jsonl',
       grading_path: 'grading.json',
       timing_path: 'timing.json',
     });
@@ -1490,7 +1446,7 @@ describe('writeArtifactsFromResults', () => {
     expect(summary).not.toHaveProperty('usage_summary');
 
     const timing = JSON.parse(
-      await readFile(path.join(testDir, 'summary-case', 'timing.json'), 'utf8'),
+      await readFile(path.join(testDir, 'summary-case', 'run-1', 'timing.json'), 'utf8'),
     );
     expect(timing).toMatchObject({
       total_tokens: 140,
@@ -1545,18 +1501,22 @@ describe('writeArtifactsFromResults', () => {
     await writeArtifactsFromResults(results, testDir);
 
     const aggregateTiming = JSON.parse(
-      await readFile(path.join(testDir, 'aggregate-usage', 'timing.json'), 'utf8'),
+      await readFile(path.join(testDir, 'aggregate-usage', 'run-1', 'timing.json'), 'utf8'),
     );
     const estimatedTiming = JSON.parse(
-      await readFile(path.join(testDir, 'estimated-usage', 'timing.json'), 'utf8'),
+      await readFile(path.join(testDir, 'estimated-usage', 'run-1', 'timing.json'), 'utf8'),
     );
-    const runTiming = JSON.parse(await readFile(path.join(testDir, 'timing.json'), 'utf8'));
+    const runSummary = JSON.parse(await readFile(path.join(testDir, 'summary.json'), 'utf8'));
 
     MetricsArtifactWireSchema.parse(
-      JSON.parse(await readFile(path.join(testDir, 'aggregate-usage', 'metrics.json'), 'utf8')),
+      JSON.parse(
+        await readFile(path.join(testDir, 'aggregate-usage', 'run-1', 'metrics.json'), 'utf8'),
+      ),
     );
     MetricsArtifactWireSchema.parse(
-      JSON.parse(await readFile(path.join(testDir, 'estimated-usage', 'metrics.json'), 'utf8')),
+      JSON.parse(
+        await readFile(path.join(testDir, 'estimated-usage', 'run-1', 'metrics.json'), 'utf8'),
+      ),
     );
 
     expect(aggregateTiming).toMatchObject({
@@ -1581,7 +1541,7 @@ describe('writeArtifactsFromResults', () => {
         duration: 'unavailable',
       },
     });
-    expect(runTiming).toMatchObject({
+    expect(runSummary.timing).toMatchObject({
       total_tokens: 20,
       cost_usd: 0.002,
       usage_sources: {
@@ -1614,10 +1574,10 @@ describe('writeArtifactsFromResults', () => {
 
     await writeArtifactsFromResults(results, testDir);
 
-    const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'provider.log');
+    const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'run-1', 'provider.log');
     expect(await readFile(copiedRawLogPath, 'utf8')).toBe(rawLog);
 
-    const transcriptPath = path.join(testDir, 'raw-log-case', 'transcript.jsonl');
+    const transcriptPath = path.join(testDir, 'raw-log-case', 'run-1', 'transcript-raw.jsonl');
     await expect(readFile(transcriptPath, 'utf8')).resolves.toContain(
       '"schema_version":"agentv.transcript.v1"',
     );
@@ -1625,17 +1585,16 @@ describe('writeArtifactsFromResults', () => {
       readFile(path.join(testDir, 'raw-log-case', 'transcript.json'), 'utf8'),
     ).rejects.toThrow();
 
-    const envelope = TraceEnvelopeWireSchema.parse(
-      JSON.parse(await readFile(path.join(testDir, 'raw-log-case', 'trace.json'), 'utf8')),
+    const transcriptMessages = JSON.parse(
+      await readFile(path.join(testDir, 'raw-log-case', 'run-1', 'transcript.json'), 'utf8'),
     );
-    expect(envelope.artifacts.raw_provider_log_path).toBe('provider.log');
-    expect(envelope.artifacts.transcript_path).toBe('transcript.jsonl');
+    expect(Array.isArray(transcriptMessages)).toBe(true);
 
     const indexLine = JSON.parse(
       (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(),
     );
-    expect(indexLine.raw_provider_log_path).toBe('raw-log-case/provider.log');
-    expect(indexLine.transcript_path).toBe('raw-log-case/transcript.jsonl');
+    expect(indexLine.raw_provider_log_path).toBeUndefined();
+    expect(indexLine.transcript_path).toBe('raw-log-case/run-1/transcript-raw.jsonl');
     expect(indexLine).not.toHaveProperty('transcript_json_path');
   });
 
@@ -1681,13 +1640,12 @@ describe('writeArtifactsFromResults', () => {
     expect(JSON.stringify(indexLine)).not.toContain('secret');
     expect(JSON.stringify(indexLine)).not.toContain('api_key');
 
-    const envelope = TraceEnvelopeWireSchema.parse(
-      JSON.parse(await readFile(path.join(testDir, 'external-trace-case', 'trace.json'), 'utf8')),
+    const transcriptJson = await readFile(
+      path.join(testDir, 'external-trace-case', 'run-1', 'transcript.json'),
+      'utf8',
     );
-    expect(envelope.external_trace).toEqual(indexLine.external_trace);
-    expect(envelope.source.metadata ?? {}).not.toHaveProperty('external_trace');
-    expect(JSON.stringify(envelope)).not.toContain('secret');
-    expect(JSON.stringify(envelope)).not.toContain('api_key');
+    expect(transcriptJson).not.toContain('secret');
+    expect(transcriptJson).not.toContain('api_key');
   });
 
   it('omits per-test transcript links when the execution trace has no transcript rows', async () => {
@@ -1701,30 +1659,20 @@ describe('writeArtifactsFromResults', () => {
 
     await writeArtifactsFromResults(results, testDir);
 
-    const transcriptPath = path.join(testDir, 'no-transcript-case', 'transcript.jsonl');
+    const transcriptPath = path.join(
+      testDir,
+      'no-transcript-case',
+      'run-1',
+      'transcript-raw.jsonl',
+    );
     await expect(readFile(transcriptPath, 'utf8')).rejects.toThrow();
 
     const indexLine = JSON.parse(
       (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(),
     );
     expect(indexLine).not.toHaveProperty('transcript_path');
-    expect(indexLine.metrics_path).toBe('no-transcript-case/metrics.json');
-    expect(indexLine.artifact_pointers.trace).toMatchObject({
-      ref: AGENTV_RESULTS_ARTIFACTS_REF,
-      key: 'traces/no-transcript-case/trace.json',
-      path: 'no-transcript-case/trace.json',
-      schema_version: EXECUTION_TRACE_SCHEMA_VERSION,
-      media_type: TRACE_JSON_MEDIA_TYPE,
-      family: 'traces',
-    });
-    expect(indexLine.artifact_pointers).not.toHaveProperty('transcript');
-    expect(indexLine.artifact_pointers).not.toHaveProperty('metrics');
-
-    const envelope = TraceEnvelopeWireSchema.parse(
-      JSON.parse(await readFile(path.join(testDir, 'no-transcript-case', 'trace.json'), 'utf8')),
-    );
-    expect(envelope.artifacts).not.toHaveProperty('transcript_path');
-    expect(envelope.artifacts.metrics_path).toBe(CANONICAL_METRICS_ARTIFACT_PATH);
+    expect(indexLine.metrics_path).toBe('no-transcript-case/run-1/metrics.json');
+    expect(indexLine.artifact_pointers).toBeUndefined();
   });
 
   it('sanitizes test IDs for directory names', async () => {
@@ -1749,10 +1697,10 @@ describe('writeArtifactsFromResults', () => {
     const paths = await writeArtifactsFromResults(results, testDir);
     const indexLines = (await readFile(paths.indexPath, 'utf8')).trim().split('\n').map(JSON.parse);
 
-    expect(indexLines[0].grading_path).toBe('shared-id/grading.json');
+    expect(indexLines[0].grading_path).toBe('shared-id/run-1/grading.json');
 
     const grading: GradingArtifact = JSON.parse(
-      await readFile(path.join(testDir, 'shared-id', 'grading.json'), 'utf8'),
+      await readFile(path.join(testDir, 'shared-id', 'run-1', 'grading.json'), 'utf8'),
     );
 
     expect(grading.assertions[0].text).toBe('baseline-check');
@@ -1768,7 +1716,7 @@ describe('writeArtifactsFromResults', () => {
       .trim()
       .split('\n')
       .map(JSON.parse);
-    expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/grading.json');
+    expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json');
   });
 
   it('writes task bundle artifacts with local source paths when source metadata is provided', async () => {
@@ -2078,8 +2026,8 @@ describe('writeArtifacts (from JSONL file)', () => {
     expect(artifactEntries).toContain('from-file');
     expect(artifactEntries).toContain('index.jsonl');
 
-    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
-    expect(timing.duration_ms).toBe(12000);
-    expect(timing.total_tokens).toBe(700);
+    const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
+    expect(summary.timing.duration_ms).toBe(12000);
+    expect(summary.timing.total_tokens).toBe(700);
   });
 });
diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts
index 8514f5a96..35ebec80e 100644
--- a/apps/cli/test/commands/eval/pipeline/bench.test.ts
+++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts
@@ -85,12 +85,12 @@ describe('pipeline bench', () => {
     expect(lines[0].test_id).toBe('test-01');
     expect(lines[0].score).toBeGreaterThan(0);
 
-    const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
+    const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'summary.json'), 'utf8'));
     expect(benchmark.metadata.targets).toContain('test-target');
     expect(benchmark.run_summary['test-target']).toBeDefined();
   }, 30_000);
 
-  it('propagates experiment from manifest to index.jsonl and benchmark.json', async () => {
+  it('propagates experiment from manifest to index.jsonl and summary.json', async () => {
     // Overwrite manifest with experiment field
     await writeFile(
       join(OUT_DIR, 'manifest.json'),
@@ -110,7 +110,7 @@ describe('pipeline bench', () => {
     const entry = JSON.parse(indexContent.trim().split('\n')[0]);
     expect(entry.experiment).toBe('without_skills');
 
-    const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
+    const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'summary.json'), 'utf8'));
     expect(benchmark.metadata.experiment).toBe('without_skills');
   }, 30_000);
 
@@ -122,7 +122,7 @@ describe('pipeline bench', () => {
     const entry = JSON.parse(indexContent.trim().split('\n')[0]);
     expect(entry.experiment).toBeUndefined();
 
-    const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
+    const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'summary.json'), 'utf8'));
     expect(benchmark.metadata.experiment).toBeUndefined();
   }, 30_000);
 });
diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
index a2e695859..fef9a62cb 100644
--- a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
+++ b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
@@ -69,7 +69,7 @@ describe('eval pipeline e2e', () => {
       expect(indexLines).toHaveLength(1);
       expect(indexLines[0].test_id).toBe('test-01');
 
-      const benchmark = JSON.parse(await readFile(join(outDir, 'benchmark.json'), 'utf8'));
+      const benchmark = JSON.parse(await readFile(join(outDir, 'summary.json'), 'utf8'));
       expect(benchmark.run_summary).toBeDefined();
     },
     PIPELINE_E2E_TIMEOUT_MS,
diff --git a/apps/cli/test/commands/results/combine.test.ts b/apps/cli/test/commands/results/combine.test.ts
index 618a628c9..bed14923f 100644
--- a/apps/cli/test/commands/results/combine.test.ts
+++ b/apps/cli/test/commands/results/combine.test.ts
@@ -90,7 +90,7 @@ describe('results combine', () => {
     expect(
       existsSync(path.join(combined.runDir, 'sources/source-1/demo/test-a/grading.json')),
     ).toBe(true);
-    const benchmark = JSON.parse(readFileSync(combined.benchmarkPath, 'utf8')) as {
+    const benchmark = JSON.parse(readFileSync(combined.summaryPath, 'utf8')) as {
       metadata: { timestamp: string };
     };
     expect(benchmark.metadata.timestamp).toBe('2026-06-01T10:00:00.000Z');
diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts
index cb1b2eee6..46f1a2c58 100644
--- a/apps/cli/test/commands/results/export-e2e-providers.test.ts
+++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts
@@ -11,8 +11,8 @@ import { tmpdir } from 'node:os';
 import path from 'node:path';
 
 import type {
-  BenchmarkArtifact,
   GradingArtifact,
+  RunSummaryArtifact,
   TimingArtifact,
 } from '../../../src/commands/eval/artifact-writer.js';
 import { exportResults } from '../../../src/commands/results/export.js';
@@ -215,6 +215,10 @@ function artifactDir(outputDir: string, record: { suite?: string; test_id?: stri
   return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId);
 }
 
+function runArtifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string {
+  return path.join(artifactDir(outputDir, record), 'run-1');
+}
+
 describe('export e2e — multi-provider metrics verification', () => {
   let tempDir: string;
 
@@ -236,7 +240,10 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const timing: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          'utf8',
+        ),
       );
 
       expect(timing.token_usage.input).toBe(2000);
@@ -251,13 +258,16 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const claudeTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          'utf8',
+        ),
       );
       const codexTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'),
       );
       const copilotTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'),
       );
 
       expect(claudeTiming.token_usage.reasoning).toBe(1500);
@@ -272,7 +282,10 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const timing: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          'utf8',
+        ),
       );
 
       expect(timing.total_tokens).toBe(2800);
@@ -285,7 +298,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const timing: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'),
       );
 
       expect(timing.duration_ms).toBe(12000);
@@ -299,7 +312,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const timing: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, MINIMAL_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'timing.json'), 'utf8'),
       );
 
       expect(timing.total_tokens).toBe(0);
@@ -316,10 +329,13 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const claudeTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          'utf8',
+        ),
       );
       const copilotTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'),
       );
 
       expect(claudeTiming.token_usage.reasoning).toBe(1500);
@@ -329,7 +345,7 @@ describe('export e2e — multi-provider metrics verification', () => {
 
   // ── Benchmark artifact tests ───────────────────────────────────────────
 
-  describe('benchmark.json — per-target summary', () => {
+  describe('summary.json — per-target summary', () => {
     it('should group results by target with correct pass rates', async () => {
       const outputDir = path.join(tempDir, 'benchmark');
       const content = toJsonl(
@@ -343,8 +359,8 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       // All 6 targets should be represented
@@ -362,8 +378,8 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       // claude: 8500ms = 8.5s
@@ -378,8 +394,8 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       // claude: 2000 + 800 = 2800
@@ -394,8 +410,8 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       expect(benchmark.run_summary['claude-cli'].cost_usd).toBeDefined();
@@ -409,8 +425,8 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       // Claude has 3 tool calls in trace steps
@@ -424,8 +440,8 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       expect(benchmark.notes.length).toBeGreaterThan(0);
@@ -438,8 +454,8 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       expect(benchmark.per_grader_summary).toBeDefined();
@@ -456,7 +472,10 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const grading: GradingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json'), 'utf8'),
+        readFileSync(
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json'),
+          'utf8',
+        ),
       );
 
       expect(grading.assertions).toHaveLength(2);
@@ -467,7 +486,10 @@ describe('export e2e — multi-provider metrics verification', () => {
       expect(grading.summary.pass_rate).toBe(1.0);
 
       const metrics = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'), 'utf8'),
+        readFileSync(
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'),
+          'utf8',
+        ),
       );
       expect(metrics.metrics.total_tool_calls).toBe(3);
       expect(metrics.metrics.tool_call_counts.Read).toBe(2);
@@ -485,7 +507,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const grading: GradingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'grading.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'grading.json'), 'utf8'),
       );
 
       expect(grading.summary.passed).toBe(1);
@@ -493,7 +515,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       expect(grading.summary.pass_rate).toBe(0.5);
 
       const metrics = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'),
       );
       expect(metrics.metrics.total_tool_calls).toBe(0);
     });
@@ -505,14 +527,14 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const grading: GradingArtifact = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'grading.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'grading.json'), 'utf8'),
       );
 
       // Error result has empty assertions
       expect(grading.summary.total).toBe(0);
       expect(grading.summary.pass_rate).toBe(0);
       const metrics = JSON.parse(
-        readFileSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'metrics.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'metrics.json'), 'utf8'),
       );
       expect(metrics.metrics.errors_encountered).toBe(1);
     });
@@ -523,10 +545,10 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       await exportResults('test.jsonl', content, outputDir);
 
-      expect(existsSync(path.join(artifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json'))).toBe(
-        true,
-      );
-      expect(existsSync(path.join(artifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe(
+      expect(
+        existsSync(path.join(runArtifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json')),
+      ).toBe(true);
+      expect(existsSync(path.join(runArtifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe(
         true,
       );
     });
@@ -543,21 +565,21 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       expect(
         readFileSync(
-          path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'answer.md'),
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'answer.md'),
           'utf8',
         ),
       ).toBe('The answer is 42, derived through extended thinking.');
 
       expect(
         readFileSync(
-          path.join(artifactDir(outputDir, CODEX_RESULT), 'outputs', 'answer.md'),
+          path.join(runArtifactDir(outputDir, CODEX_RESULT), 'outputs', 'answer.md'),
           'utf8',
         ),
       ).toBe('Applied the requested edit to src/main.ts.');
 
       expect(
         readFileSync(
-          path.join(artifactDir(outputDir, COPILOT_RESULT), 'outputs', 'answer.md'),
+          path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'outputs', 'answer.md'),
           'utf8',
         ),
       ).toBe('function add(a, b) { return a + b }');
@@ -570,7 +592,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       expect(
-        existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'outputs', 'answer.md')),
+        existsSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'outputs', 'answer.md')),
       ).toBe(false);
     });
   });
@@ -594,12 +616,12 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('eval_2026-03-18.jsonl', content, outputDir);
 
       // Verify all artifact files exist
-      expect(existsSync(path.join(outputDir, 'benchmark.json'))).toBe(true);
-      expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(true);
+      expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true);
+      expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false);
 
       // Verify benchmark
-      const benchmark: BenchmarkArtifact = JSON.parse(
-        readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+      const benchmark: RunSummaryArtifact = JSON.parse(
+        readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
       );
 
       // 7 unique targets (claude-cli appears twice with error result)
@@ -607,26 +629,28 @@ describe('export e2e — multi-provider metrics verification', () => {
       expect(benchmark.metadata.eval_file).toBe('eval_2026-03-18.jsonl');
 
       // Verify grading files
-      expect(existsSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json'))).toBe(
-        true,
-      );
-      expect(existsSync(path.join(artifactDir(outputDir, CODEX_RESULT), 'grading.json'))).toBe(
+      expect(
+        existsSync(path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json')),
+      ).toBe(true);
+      expect(existsSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'grading.json'))).toBe(
         true,
       );
-      expect(existsSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'grading.json'))).toBe(
+      expect(existsSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'grading.json'))).toBe(
         true,
       );
-      expect(existsSync(path.join(artifactDir(outputDir, PI_RESULT), 'grading.json'))).toBe(true);
-      expect(existsSync(path.join(artifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json'))).toBe(
+      expect(existsSync(path.join(runArtifactDir(outputDir, PI_RESULT), 'grading.json'))).toBe(
         true,
       );
-      expect(existsSync(path.join(artifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe(
+      expect(
+        existsSync(path.join(runArtifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json')),
+      ).toBe(true);
+      expect(existsSync(path.join(runArtifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe(
         true,
       );
-      expect(existsSync(path.join(artifactDir(outputDir, MINIMAL_RESULT), 'grading.json'))).toBe(
+      expect(existsSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'grading.json'))).toBe(
         true,
       );
-      expect(existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'grading.json'))).toBe(
+      expect(existsSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'grading.json'))).toBe(
         true,
       );
     });
@@ -660,7 +684,10 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       const timing: TimingArtifact = JSON.parse(
         readFileSync(
-          path.join(artifactDir(outputDir, { ...record, target: 'mock' as const }), 'timing.json'),
+          path.join(
+            runArtifactDir(outputDir, { ...record, target: 'mock' as const }),
+            'timing.json',
+          ),
           'utf8',
         ),
       );
diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts
index 9bb6b283a..8e7f58c31 100644
--- a/apps/cli/test/commands/results/export.test.ts
+++ b/apps/cli/test/commands/results/export.test.ts
@@ -4,9 +4,9 @@ import { tmpdir } from 'node:os';
 import path from 'node:path';
 
 import type {
-  BenchmarkArtifact,
   GradingArtifact,
   IndexArtifactEntry,
+  RunSummaryArtifact,
   TimingArtifact,
 } from '../../../src/commands/eval/artifact-writer.js';
 import { parseJsonlResults } from '../../../src/commands/eval/artifact-writer.js';
@@ -168,6 +168,10 @@ function artifactDir(outputDir: string, record: { suite?: string; test_id?: stri
   return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId);
 }
 
+function runArtifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string {
+  return path.join(artifactDir(outputDir, record), 'run-1'); // 'run-1' subdir of the record's dir
+}
+
 function readIndex(outputDir: string): IndexArtifactEntry[] {
   return readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8')
     .trim()
@@ -177,7 +181,7 @@ function readIndex(outputDir: string): IndexArtifactEntry[] {
 }
 
 function readAnswer(outputDir: string, record: { suite?: string; test_id?: string }): string {
-  return readFileSync(path.join(artifactDir(outputDir, record), 'outputs', 'answer.md'), 'utf8');
+  return readFileSync(path.join(runArtifactDir(outputDir, record), 'outputs', 'answer.md'), 'utf8');
 }
 
 describe('results export', () => {
@@ -272,7 +276,7 @@ describe('results export', () => {
     });
     expect(first.entries[0].artifact_refs).toMatchObject({
       status: 'planned_export',
-      timing_path: 'privacy/test-private/timing.json',
+      timing_path: 'privacy/test-private/run-1/timing.json',
     });
     expect(first.entries[0].artifact_refs).not.toHaveProperty('input_path');
     expect(first.entries[0].artifact_refs).not.toHaveProperty('output_path');
@@ -349,14 +353,20 @@ describe('results export', () => {
     });
     expect(bundle.entries[0].artifact_refs).toMatchObject({
       status: 'planned_export',
-      input_path: 'privacy/test-private/task/PROMPT.md',
-      output_path: 'privacy/test-private/outputs/answer.md',
-      answer_path: 'privacy/test-private/outputs/answer.md',
+      artifact_dir: 'privacy/test-private',
+      summary_path: 'privacy/test-private/summary.json',
+      grading_path: 'privacy/test-private/run-1/grading.json',
+      timing_path: 'privacy/test-private/run-1/timing.json',
+      metrics_path: 'privacy/test-private/run-1/metrics.json',
+      output_path: 'privacy/test-private/run-1/outputs/answer.md',
+      answer_path: 'privacy/test-private/run-1/outputs/answer.md',
+      transcript_path: 'privacy/test-private/run-1/transcript-raw.jsonl',
       trace_path: 'privacy/test-private/trace.json',
     });
+    expect(bundle.entries[0].artifact_refs).not.toHaveProperty('input_path');
     expect(bundle.entries[0].trace.envelope_ref).toBe('privacy/test-private/trace.json');
     expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined();
-    expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/grading.json');
+    expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/run-1/grading.json');
     expect(bundle.entries[0].raw_content).toBeDefined();
     expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence');
     expect(serialized).toContain('SECRET_PROMPT_TEXT');
@@ -366,16 +376,16 @@ describe('results export', () => {
     expect(serialized).toContain('SECRET_SCORE_EVIDENCE');
   });
 
-  it('should create benchmark.json matching artifact-writer schema', async () => {
+  it('should create summary.json matching artifact-writer schema', async () => {
     const outputDir = path.join(tempDir, 'output');
     const content = toJsonl(RESULT_FULL, RESULT_PARTIAL);
 
     await exportResults('eval_2026-03-18.jsonl', content, outputDir);
 
-    const benchmarkPath = path.join(outputDir, 'benchmark.json');
-    expect(existsSync(benchmarkPath)).toBe(true);
+    const summaryPath = path.join(outputDir, 'summary.json');
+    expect(existsSync(summaryPath)).toBe(true);
 
-    const benchmark: BenchmarkArtifact = JSON.parse(readFileSync(benchmarkPath, 'utf8'));
+    const benchmark: RunSummaryArtifact = JSON.parse(readFileSync(summaryPath, 'utf8'));
     expect(benchmark.metadata.eval_file).toBe('eval_2026-03-18.jsonl');
     expect(benchmark.metadata.timestamp).toBe('2026-03-18T10:00:01.000Z');
     // artifact-writer uses string[] for tests_run, not a count
@@ -412,13 +422,16 @@ describe('results export', () => {
       test_id: 'test-greeting',
       target: 'gpt-4o',
       execution_status: 'ok',
-      grading_path: 'demo/test-greeting/grading.json',
-      timing_path: 'demo/test-greeting/timing.json',
-      output_path: 'demo/test-greeting/outputs/answer.md',
-      answer_path: 'demo/test-greeting/outputs/answer.md',
-      transcript_path: 'demo/test-greeting/transcript.jsonl',
-      input_path: 'demo/test-greeting/task/PROMPT.md',
+      artifact_dir: 'demo/test-greeting',
+      summary_path: 'demo/test-greeting/summary.json',
+      grading_path: 'demo/test-greeting/run-1/grading.json',
+      timing_path: 'demo/test-greeting/run-1/timing.json',
+      metrics_path: 'demo/test-greeting/run-1/metrics.json',
+      output_path: 'demo/test-greeting/run-1/outputs/answer.md',
+      answer_path: 'demo/test-greeting/run-1/outputs/answer.md',
+      transcript_path: 'demo/test-greeting/run-1/transcript-raw.jsonl',
     });
+    expect(entries[0]).not.toHaveProperty('input_path');
     expect(entries[0].projection_identity).toMatchObject({
       schema_version: 'agentv.projection_identity.v1',
       dimensions: {
@@ -551,7 +564,7 @@ describe('results export', () => {
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const timingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'timing.json');
+    const timingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json');
     expect(existsSync(timingPath)).toBe(true);
 
     const timing: TimingArtifact = JSON.parse(readFileSync(timingPath, 'utf8'));
@@ -568,7 +581,7 @@ describe('results export', () => {
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const gradingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'grading.json');
+    const gradingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'grading.json');
     expect(existsSync(gradingPath)).toBe(true);
 
     const grading: GradingArtifact = JSON.parse(readFileSync(gradingPath, 'utf8'));
@@ -596,7 +609,7 @@ describe('results export', () => {
     expect(grading.graders?.[0].name).toBe('greeting_quality');
     expect(grading.graders?.[0].type).toBe('llm-grader');
 
-    const perTestTimingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'timing.json');
+    const perTestTimingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json');
     expect(existsSync(perTestTimingPath)).toBe(true);
   });
 
@@ -606,22 +619,26 @@ describe('results export', () => {
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const answerPath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'answer.md');
+    const answerPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'outputs', 'answer.md');
     expect(existsSync(answerPath)).toBe(true);
     expect(readFileSync(answerPath, 'utf8')).toBe('Hello, Alice!');
 
-    const responsePath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'response.md');
+    const responsePath = path.join(
+      runArtifactDir(outputDir, RESULT_FULL),
+      'outputs',
+      'response.md',
+    );
     expect(existsSync(responsePath)).toBe(false);
   });
 
-  it('should group results by target in benchmark.json', async () => {
+  it('should group results by target in summary.json', async () => {
     const outputDir = path.join(tempDir, 'output');
     const content = toJsonl(RESULT_FULL, RESULT_DIFFERENT_TARGET);
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const benchmark: BenchmarkArtifact = JSON.parse(
-      readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+    const benchmark: RunSummaryArtifact = JSON.parse(
+      readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
     );
 
     expect(benchmark.run_summary['gpt-4o']).toBeDefined();
@@ -644,14 +661,16 @@ describe('results export', () => {
 
     await exportResults('test.jsonl', content, outputDir);
 
-    expect(existsSync(path.join(outputDir, 'benchmark.json'))).toBe(true);
+    expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true);
     expect(existsSync(path.join(outputDir, 'index.jsonl'))).toBe(true);
-    expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(true);
-    expect(existsSync(path.join(artifactDir(outputDir, RESULT_FULL), 'grading.json'))).toBe(true);
-    expect(existsSync(path.join(artifactDir(outputDir, RESULT_PARTIAL), 'grading.json'))).toBe(
+    expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false);
+    expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_FULL), 'grading.json'))).toBe(
       true,
     );
-    expect(existsSync(path.join(artifactDir(outputDir, RESULT_NO_TRACE), 'grading.json'))).toBe(
+    expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_PARTIAL), 'grading.json'))).toBe(
+      true,
+    );
+    expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_NO_TRACE), 'grading.json'))).toBe(
       true,
     );
   });
@@ -662,8 +681,8 @@ describe('results export', () => {
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const benchmark: BenchmarkArtifact = JSON.parse(
-      readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+    const benchmark: RunSummaryArtifact = JSON.parse(
+      readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
     );
 
     expect(benchmark.per_grader_summary).toBeDefined();
@@ -677,6 +696,7 @@ describe('results export', () => {
 
     const answerPath = path.join(
       artifactDir(outputDir, RESULT_DIFFERENT_TARGET),
+      'run-1',
       'outputs',
       'answer.md',
     );
@@ -702,7 +722,7 @@ describe('results export', () => {
     await exportResults('test.jsonl', content, outputDir);
 
     const gradingPath = path.join(
-      artifactDir(outputDir, { ...minimal, target: 'default' }),
+      runArtifactDir(outputDir, { ...minimal, target: 'default' }),
       'grading.json',
     );
     expect(existsSync(gradingPath)).toBe(true);
@@ -712,7 +732,7 @@ describe('results export', () => {
     expect(grading.summary.total).toBe(0);
   });
 
-  it('should write string input to <test-id>/task/PROMPT.md', async () => {
+  it('should not write string input to a generated prompt sidecar', async () => {
     const outputDir = path.join(tempDir, 'output');
     const resultWithInput = {
       ...RESULT_FULL,
@@ -723,11 +743,11 @@ describe('results export', () => {
     await exportResults('test.jsonl', content, outputDir);
 
     const inputPath = path.join(artifactDir(outputDir, resultWithInput), 'task', 'PROMPT.md');
-    expect(existsSync(inputPath)).toBe(true);
-    expect(readFileSync(inputPath, 'utf8')).toBe('What is the capital of France?');
+    expect(existsSync(inputPath)).toBe(false);
+    expect(readIndex(outputDir)[0]).not.toHaveProperty('input_path');
   });
 
-  it('should write Message[] input to <test-id>/task/PROMPT.md as markdown', async () => {
+  it('should not write Message[] input to a generated prompt sidecar', async () => {
     const outputDir = path.join(tempDir, 'output');
     const resultWithMessages = {
       ...RESULT_FULL,
@@ -741,8 +761,8 @@ describe('results export', () => {
     await exportResults('test.jsonl', content, outputDir);
 
     const inputPath = path.join(artifactDir(outputDir, resultWithMessages), 'task', 'PROMPT.md');
-    expect(existsSync(inputPath)).toBe(true);
-    expect(readFileSync(inputPath, 'utf8')).toBe('@[user]:\nHello\n\n@[assistant]:\nHi there!');
+    expect(existsSync(inputPath)).toBe(false);
+    expect(readIndex(outputDir)[0]).not.toHaveProperty('input_path');
   });
 
   it('should not create input file when input is missing', async () => {
@@ -765,8 +785,8 @@ describe('results export', () => {
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const benchmark: BenchmarkArtifact = JSON.parse(
-      readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'),
+    const benchmark: RunSummaryArtifact = JSON.parse(
+      readFileSync(path.join(outputDir, 'summary.json'), 'utf8'),
     );
     expect(benchmark.metadata.targets).toEqual(['unknown']);
     expect(benchmark.metadata.tests_run).toEqual(['unknown']);
diff --git a/apps/cli/test/commands/results/remote-auto-export.test.ts b/apps/cli/test/commands/results/remote-auto-export.test.ts
index f1fead56e..cfbfadfe1 100644
--- a/apps/cli/test/commands/results/remote-auto-export.test.ts
+++ b/apps/cli/test/commands/results/remote-auto-export.test.ts
@@ -67,7 +67,7 @@ function writeRunArtifacts(projectDir: string): string {
     `${JSON.stringify({ test_id: 'alpha', score: 1 })}\n`,
   );
   writeFileSync(
-    path.join(runDir, 'benchmark.json'),
+    path.join(runDir, 'summary.json'),
     `${JSON.stringify({ eval_file: 'evals/example.eval.yaml', tests_run: 1 }, null, 2)}\n`,
   );
   return runDir;
@@ -121,7 +121,7 @@ function writeRunArtifactsWithPointers(projectDir: string): string {
     })}\n`,
   );
   writeFileSync(
-    path.join(runDir, 'benchmark.json'),
+    path.join(runDir, 'summary.json'),
     `${JSON.stringify({ eval_file: 'evals/example.eval.yaml', tests_run: 1 }, null, 2)}\n`,
   );
   return runDir;
@@ -227,7 +227,7 @@ describe('maybeAutoExportRunArtifacts', () => {
       rootDir,
     );
     expect(resultTree).toContain('runs/default/run-002/index.jsonl');
-    expect(resultTree).toContain('runs/default/run-002/benchmark.json');
+    expect(resultTree).toContain('runs/default/run-002/summary.json');
     expect(resultTree).not.toContain('runs/default/run-002/alpha/trace.json');
     expect(resultTree).not.toContain('runs/default/run-002/alpha/transcript.jsonl');
     const index = JSON.parse(
diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts
index e4767858a..0e69332ac 100644
--- a/apps/cli/test/commands/results/report.test.ts
+++ b/apps/cli/test/commands/results/report.test.ts
@@ -76,14 +76,14 @@ describe('results report', () => {
     expect(deriveReportPath(sourceFile)).toBe(path.join(tempDir, 'run', 'report.html'));
   });
 
-  it('loads benchmark eval file metadata from a run workspace', async () => {
+  it('loads run summary eval file metadata from a run workspace', async () => {
     const runDir = path.join(tempDir, 'run');
     await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });
 
     const loaded = await loadReportSource(runDir, tempDir);
 
     expect(loaded.results).toHaveLength(1);
-    expect(loaded.benchmarkEvalFile).toBe('demo');
+    expect(loaded.summaryEvalFile).toBe('demo');
   });
 
   it('writes a static HTML report with grouped eval files and assertion type badges', async () => {
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 64fe8d68a..ca5f6943b 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -264,7 +264,7 @@ function writeRemoteRunArtifact(
   const records = Array.isArray(resultRecords) ? resultRecords : [resultRecords];
   writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(...records));
   writeFileSync(
-    path.join(runDir, 'benchmark.json'),
+    path.join(runDir, 'summary.json'),
     JSON.stringify(
       {
         metadata: {
@@ -303,7 +303,7 @@ function writeDirtyRemoteRunArtifact(
   mkdirSync(runDir, { recursive: true });
   writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultRecord));
   writeFileSync(
-    path.join(runDir, 'benchmark.json'),
+    path.join(runDir, 'summary.json'),
     JSON.stringify(
       {
         metadata: {
@@ -354,7 +354,7 @@ function writeLocalRunArtifact(
   mkdirSync(runDir, { recursive: true });
   writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl({ ...resultRecord, experiment }));
   writeFileSync(
-    path.join(runDir, 'benchmark.json'),
+    path.join(runDir, 'summary.json'),
     JSON.stringify(
       {
         metadata: {
@@ -1597,7 +1597,7 @@ describe('serve app', () => {
       expect(git('git branch --show-current', cloneDir)).toBe('main');
       expect(
         git(
-          'git show-ref --verify --quiet refs/remotes/agentv-results/agentv-results && echo present || true',
+          'git show-ref --verify --quiet refs/remotes/origin/agentv-results && echo present || true',
           cloneDir,
         ),
       ).toBe('');
@@ -2340,6 +2340,7 @@ describe('serve app', () => {
           '2026-03-26T12-30-00-000Z',
           RESULT_A,
         );
+        git(`git remote set-url origin "${missingRemoteUrl}"`, cloneDir);
         const app = createApp([], tempDir, tempDir, undefined, { studioDir });
         const res = await app.request('/api/projects/project-sync-offline/remote/sync', {
           method: 'POST',
@@ -2679,7 +2680,7 @@ describe('serve app', () => {
       };
       expect(tags.tags.sort()).toEqual(['baseline', 'candidate', 'shared']);
       const benchmark = JSON.parse(
-        readFileSync(path.join(combinedDir, 'benchmark.json'), 'utf8'),
+        readFileSync(path.join(combinedDir, 'summary.json'), 'utf8'),
       ) as {
         metadata: { combined_from_run_ids?: string[]; display_name?: string; timestamp?: string };
       };
@@ -3460,7 +3461,7 @@ describe('serve app', () => {
         }),
       );
       writeFileSync(
-        path.join(runDir, 'benchmark.json'),
+        path.join(runDir, 'summary.json'),
         JSON.stringify(
           {
             metadata: {
@@ -3496,7 +3497,7 @@ describe('serve app', () => {
         autoPush: false,
       });
 
-      const artifactRemoteRef = `refs/remotes/agentv-results/${AGENTV_RESULTS_ARTIFACTS_REF}`;
+      const artifactRemoteRef = `refs/remotes/origin/${AGENTV_RESULTS_ARTIFACTS_REF}`;
       const artifactRefLookup = () =>
         git(
           `git -C "${cloneDir}" show-ref --verify --quiet ${artifactRemoteRef} && echo present || true`,
@@ -4404,11 +4405,11 @@ describe('serve app', () => {
   //
   // The Dashboard "Resume run" / "Rerun failed cases" buttons need the run dir
   // and the original eval file path to issue a launch request that targets
-  // the same run workspace. handleRunDetail reads benchmark.json's
+  // the same run workspace. handleRunDetail reads summary.json's
   // metadata.eval_file and reports the run dir relative to cwd.
 
   describe('GET /api/runs/:filename (resume metadata)', () => {
-    it('includes run_dir and suite_filter for local runs with benchmark.json', async () => {
+    it('includes run_dir and suite_filter for local runs with summary.json', async () => {
       const runsDir = localResultsExperimentDir(tempDir);
       mkdirSync(runsDir, { recursive: true });
       const filename = '2026-05-06T00-00-00-000Z';
@@ -4416,7 +4417,7 @@ describe('serve app', () => {
       mkdirSync(runDir, { recursive: true });
       writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A));
       writeFileSync(
-        path.join(runDir, 'benchmark.json'),
+        path.join(runDir, 'summary.json'),
         JSON.stringify(
           {
             metadata: {
@@ -4446,7 +4447,7 @@ describe('serve app', () => {
       expect(data.suite_filter).toBe('examples/demo.eval.yaml');
     });
 
-    it('omits suite_filter when benchmark.json is missing', async () => {
+    it('omits suite_filter when summary.json is missing', async () => {
       const runsDir = localResultsExperimentDir(tempDir);
       mkdirSync(runsDir, { recursive: true });
       const filename = '2026-05-06T00-00-01-000Z';
diff --git a/apps/cli/test/commands/results/validate.test.ts b/apps/cli/test/commands/results/validate.test.ts
index 5ebdd0742..3836b90bc 100644
--- a/apps/cli/test/commands/results/validate.test.ts
+++ b/apps/cli/test/commands/results/validate.test.ts
@@ -25,7 +25,29 @@ describe('results validate', () => {
           test_id: 'test-greeting',
           score: 1,
           target: 'gpt-4o',
+          scores: [{ name: 'quality', type: 'llm', score: 1, verdict: 'pass' }],
           execution_status: 'ok',
+          summary_path: 'test-greeting/summary.json',
+        })}\n`,
+      );
+      mkdirSync(path.join(runDir, 'test-greeting'), { recursive: true });
+      writeFileSync(
+        path.join(runDir, 'test-greeting', 'summary.json'),
+        `${JSON.stringify({
+          test_id: 'test-greeting',
+          score: 1,
+          target: 'gpt-4o',
+          execution_status: 'ok',
+        })}\n`,
+      );
+      writeFileSync(
+        path.join(runDir, 'summary.json'),
+        `${JSON.stringify({
+          schema_version: 1,
+          metadata: {
+            experiment: 'with-skills',
+            timestamp: '2026-03-27T12:42:24.429Z',
+          },
         })}\n`,
       );
 
diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts
index 91b8934cf..dd9629bf6 100644
--- a/apps/cli/test/commands/trace/trace.test.ts
+++ b/apps/cli/test/commands/trace/trace.test.ts
@@ -408,7 +408,7 @@ describe('trace utils', () => {
 
       writeFileSync(path.join(runDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n`);
       writeFileSync(
-        path.join(runDir, 'benchmark.json'),
+        path.join(runDir, 'summary.json'),
         JSON.stringify({
           metadata: {
             display_name: 'Combined run (dogfood-run-a + dogfood-run-b)',
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 083819931..7972e1ea8 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -343,10 +343,11 @@ describe('agentv eval CLI', () => {
 
       const results = await readJsonLines(path.join(outputDir, 'index.jsonl'));
       expect(results).toHaveLength(2);
-      await expectFileExists(path.join(outputDir, 'benchmark.json'));
-      await expectFileExists(path.join(outputDir, 'timing.json'));
-      await expectFileExists(path.join(outputDir, 'case-alpha', 'grading.json'));
-      await expectFileExists(path.join(outputDir, 'case-beta', 'grading.json'));
+      await expectFileExists(path.join(outputDir, 'summary.json'));
+      await expectFileExists(path.join(outputDir, 'case-alpha', 'summary.json'));
+      await expectFileExists(path.join(outputDir, 'case-alpha', 'run-1', 'grading.json'));
+      await expectFileExists(path.join(outputDir, 'case-beta', 'summary.json'));
+      await expectFileExists(path.join(outputDir, 'case-beta', 'run-1', 'grading.json'));
     } finally {
       await rm(fixture.baseDir, { recursive: true, force: true });
     }
@@ -363,8 +364,9 @@ describe('agentv eval CLI', () => {
       expect(exitCode).toBe(0);
       expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl'));
       await expectFileExists(path.join(outputDir, 'index.jsonl'));
-      await expectFileExists(path.join(outputDir, 'benchmark.json'));
-      await expectFileExists(path.join(outputDir, 'case-alpha', 'grading.json'));
+      await expectFileExists(path.join(outputDir, 'summary.json'));
+      await expectFileExists(path.join(outputDir, 'case-alpha', 'summary.json'));
+      await expectFileExists(path.join(outputDir, 'case-alpha', 'run-1', 'grading.json'));
     } finally {
       await rm(fixture.baseDir, { recursive: true, force: true });
     }
@@ -403,10 +405,9 @@ describe('agentv eval CLI', () => {
 
       const canonicalResults = await readJsonLines(path.join(outputDir, 'index.jsonl'));
       expect(canonicalResults).toHaveLength(2);
-      await expectFileExists(path.join(outputDir, 'benchmark.json'));
-      await expectFileExists(path.join(outputDir, 'timing.json'));
+      await expectFileExists(path.join(outputDir, 'summary.json'));
       for (const row of canonicalResults) {
-        expect(row.transcript_path).toMatch(/transcript\.jsonl$/);
+        expect(row.transcript_path).toMatch(/run-1\/transcript-raw\.jsonl$/);
         await expectFileExists(path.join(outputDir, row.transcript_path as string));
       }
     } finally {
@@ -592,7 +593,7 @@ describe('agentv eval CLI', () => {
       await expectFileExists(path.join(fixture.suiteDir, 'experiment-script.txt'));
 
       const benchmark = JSON.parse(
-        await readFile(path.join(path.dirname(outputPath), 'benchmark.json'), 'utf8'),
+        await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'),
       ) as { metadata?: Record<string, unknown> };
       expect(benchmark.metadata?.experiment).toBe('native-exp');
       expect(benchmark.metadata?.experiment_config).toMatchObject({
@@ -782,7 +783,7 @@ describe('agentv eval CLI', () => {
     const helpText = `${result.stdout}\n${result.stderr}`;
     expect(helpText).not.toContain('--benchmark-json');
     expect(helpText).toContain('--output');
-    expect(helpText).toContain('benchmark.json');
+    expect(helpText).toContain('summary.json');
   }, 30_000);
 
   it('rejects the removed benchmark JSON export flag as an unknown argument', async () => {
@@ -792,7 +793,7 @@ describe('agentv eval CLI', () => {
         'eval',
         fixture.testFilePath,
         '--benchmark-json',
-        path.join(fixture.baseDir, 'benchmark.json'),
+        path.join(fixture.baseDir, 'summary.json'),
       ]);
 
       expect(result.exitCode).not.toBe(0);
diff --git a/apps/dashboard/src/components/EvalDetail.tsx b/apps/dashboard/src/components/EvalDetail.tsx
index 0ffd153af..4592197a6 100644
--- a/apps/dashboard/src/components/EvalDetail.tsx
+++ b/apps/dashboard/src/components/EvalDetail.tsx
@@ -6,7 +6,7 @@
  * Assertions are grouped by grader name.
  */
 
-import { useMemo, useState } from 'react';
+import { useEffect, useMemo, useState } from 'react';
 
 import { useQuery } from '@tanstack/react-query';
 import {
@@ -20,8 +20,10 @@ import {
   useEvalTranscript,
   useStudioConfig,
 } from '~/lib/api';
+import type { RepeatRunGroup } from '~/lib/result-table';
 import type {
   AssertionEntry,
+  EvalCaseTrial,
   EvalResult,
   ScoreEntry,
   SourceCapturedFile,
@@ -40,6 +42,11 @@ interface EvalDetailProps {
   eval: EvalResult;
   runId: string;
   projectId?: string;
+  repeatGroup?: RepeatRunGroup;
+  selectedTrial?: EvalCaseTrial | null;
+  initialTab?: Tab;
+  initialSelectedFilePath?: string | null;
+  onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void;
 }
 
 type Tab = 'checks' | 'transcript' | 'source' | 'files' | 'feedback';
@@ -56,11 +63,90 @@ function findFirstFile(nodes: FileNode[]): string | null {
   return null;
 }
 
-export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) {
-  const [activeTab, setActiveTab] = useState<Tab>('checks');
-  const [selectedFilePath, setSelectedFilePath] = useState<string | null>(null);
+function caseTrialPath(trial: EvalCaseTrial, index = 0): string { // Label/key for a trial: its run_path when present.
+  return trial.run_path ?? `run-${trial.attempt ?? index + 1}`; // Otherwise 'run-N' from attempt, or from the 0-based index + 1.
+}
+
+function caseTrialTokenTotal(trial: EvalCaseTrial): number | undefined { // Total tokens for a trial, or undefined when nothing usable is recorded.
+  if (trial.total_tokens != null) return trial.total_tokens; // An explicit total wins over the per-category breakdown.
+  const usage = trial.token_usage;
+  if (!usage) return undefined;
+  const values = [usage.input, usage.output, usage.reasoning, usage.cached].filter( // Keep only finite numeric counters.
+    (value): value is number => typeof value === 'number' && Number.isFinite(value),
+  );
+  return values.length > 0 ? values.reduce((sum, value) => sum + value, 0) : undefined; // Sum of the remaining counters; undefined when none remain.
+}
+
+function formatPercent(value: number | undefined): string { // Formats a ratio as a whole-number percent string.
+  if (value == null || !Number.isFinite(value)) return '-'; // Missing, NaN and infinite values render as a dash.
+  return `${Math.round(value * 100)}%`; // The value is multiplied by 100, so 0.5 renders as '50%'.
+}
+
+function formatDuration(durationMs: number | undefined): string { // Renders milliseconds as '-', 'Nms', 'N.Ns' or 'Nm Ns'.
+  if (durationMs == null) return '-';
+  if (durationMs < 1000) return `${Math.round(durationMs)}ms`;
+  if (durationMs < 60_000) return `${(durationMs / 1000).toFixed(1)}s`;
+  const totalSeconds = Math.round(durationMs / 1000); // Round once, before splitting, so the seconds part can never display as 60.
+  const minutes = Math.floor(totalSeconds / 60);
+  return `${minutes}m ${totalSeconds % 60}s`; // e.g. 119_600 ms -> '2m 0s' (previously '1m 60s').
+}
+
+function formatCost(costUsd: number | undefined): string | undefined { // Formats a USD amount; undefined when no cost is recorded.
+  if (costUsd == null) return undefined;
+  if (costUsd === 0) return '$0';
+  if (costUsd < 0.01) return `$${costUsd.toFixed(5)}`; // Sub-cent amounts get five decimals.
+  return `$${costUsd.toFixed(4)}`; // Everything else gets four decimals.
+}
+
+function formatTokens(tokens: number | undefined): string | undefined { // Formats a token count with a ' tok' suffix; undefined when missing.
+  if (tokens == null) return undefined;
+  if (tokens >= 1_000_000) return `${(tokens / 1_000_000).toFixed(1)}M tok`; // Millions, one decimal.
+  if (tokens >= 1000) return `${(tokens / 1000).toFixed(1)}k tok`; // Thousands, one decimal.
+  return `${tokens} tok`; // Below 1000 the raw number is shown.
+}
+
+function selectedTrialResult(result: EvalResult, trial: EvalCaseTrial): EvalResult { // Builds a per-trial view of a result by overlaying the trial's fields on a copy.
+  return {
+    ...result, // Start from the original result; the fields below override it.
+    score: trial.score ?? result.score, // Fields using ?? keep the original value when the trial has none.
+    executionStatus: trial.execution_status ?? result.executionStatus,
+    error: trial.error, // No fallback: the original error is replaced even when the trial has none.
+    costUsd: trial.cost_usd ?? result.costUsd,
+    durationMs: trial.duration_ms ?? result.durationMs,
+    scores: trial.scores, // No fallback here either; same for assertions below.
+    assertions: trial.assertions,
+    trials: undefined, // Cleared on the returned copy.
+    aggregation: undefined, // Cleared on the returned copy.
+    grading_path: trial.grading_path,
+    timing_path: trial.timing_path,
+    metrics_path: trial.metrics_path,
+    transcript_path: trial.transcript_path,
+    output_path: trial.answer_path, // output_path and answer_path both take the trial's answer_path.
+    answer_path: trial.answer_path,
+  };
+}
+
+export function EvalDetail({
+  eval: result,
+  runId,
+  projectId,
+  repeatGroup,
+  selectedTrial = null,
+  initialTab = 'checks',
+  initialSelectedFilePath = null,
+  onSelectTrial,
+}: EvalDetailProps) {
+  const [activeTab, setActiveTab] = useState<Tab>(initialTab);
+  const [selectedFilePath, setSelectedFilePath] = useState<string | null>(initialSelectedFilePath);
   const { data: config } = useStudioConfig(projectId);
   const isReadOnly = config?.read_only === true;
+  const detailResult = selectedTrial ? selectedTrialResult(result, selectedTrial) : result;
+  const showAggregateRepeat = repeatGroup != null && selectedTrial == null;
+
+  useEffect(() => {
+    setActiveTab(initialTab);
+    setSelectedFilePath(initialSelectedFilePath);
+  }, [initialTab, initialSelectedFilePath]);
 
   const tabs: { id: Tab; label: string }[] = [
     { id: 'checks', label: 'Checks' },
@@ -76,7 +162,7 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
   };
 
   return (
-    <div className="flex h-full min-h-full flex-col">
+    <div className="flex h-full min-h-0 flex-col">
       {/* Tab navigation — at the top so Files tab editor fills maximum height */}
       <div className="border-b border-gray-800">
         <div className="flex gap-1 px-4">
@@ -98,16 +184,32 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
       </div>
 
       {/* Tab content */}
-      <div className="min-h-0 flex-1">
+      <div className="min-h-0 flex-1 overflow-hidden">
         {activeTab === 'checks' && (
           <div className="overflow-auto p-4">
-            <ChecksTab result={result} projectId={projectId} />
+            {showAggregateRepeat ? (
+              <RepeatAggregateChecksTab
+                result={result}
+                group={repeatGroup}
+                onSelectTrial={onSelectTrial}
+              />
+            ) : selectedTrial ? (
+              <TrialChecksTab
+                result={detailResult}
+                trial={selectedTrial}
+                runId={runId}
+                projectId={projectId}
+                onOpenFile={openFile}
+              />
+            ) : (
+              <ChecksTab result={detailResult} projectId={projectId} />
+            )}
           </div>
         )}
         {activeTab === 'files' && (
-          <div className="h-full p-4">
+          <div className="h-full min-h-0 p-4">
             <FilesTab
-              result={result}
+              result={detailResult}
               runId={runId}
               projectId={projectId}
               selectedPath={selectedFilePath}
@@ -117,22 +219,40 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
         )}
         {activeTab === 'transcript' && (
           <div className="overflow-auto p-4">
-            <TranscriptTab
-              result={result}
-              runId={runId}
-              projectId={projectId}
-              onOpenFile={openFile}
-            />
+            {showAggregateRepeat ? (
+              <RepeatAggregateTranscriptTab
+                result={result}
+                group={repeatGroup}
+                runId={runId}
+                projectId={projectId}
+                onSelectTrial={onSelectTrial}
+              />
+            ) : selectedTrial ? (
+              <TrialTranscriptTab
+                result={detailResult}
+                trial={selectedTrial}
+                runId={runId}
+                projectId={projectId}
+                onOpenFile={openFile}
+              />
+            ) : (
+              <TranscriptTab
+                result={detailResult}
+                runId={runId}
+                projectId={projectId}
+                onOpenFile={openFile}
+              />
+            )}
           </div>
         )}
         {activeTab === 'source' && (
           <div className="overflow-auto p-4">
-            <SourceTab result={result} />
+            <SourceTab result={detailResult} />
           </div>
         )}
         {!isReadOnly && activeTab === 'feedback' && (
           <div className="p-4">
-            <FeedbackPanel testId={result.testId} projectId={projectId} />
+            <FeedbackPanel testId={detailResult.testId} projectId={projectId} />
           </div>
         )}
       </div>
@@ -433,6 +553,231 @@ function ChecksTab({ result, projectId }: { result: EvalResult; projectId?: stri
   );
 }
 
+function RunMetricRow({ label, value }: { label: string; value: string | undefined }) { // Small labeled metric card; shows '-' when value is null or undefined.
+  return (
+    <div className="rounded-lg border border-gray-800 bg-gray-900 p-3">
+      <div className="text-xs font-medium uppercase tracking-wider text-gray-500">{label}</div>
+      <div className="mt-1 font-mono text-sm text-gray-200">{value ?? '-'}</div>
+    </div>
+  );
+}
+
+function TrialActionRow({ // One row per trial: summary stats plus Checks/Files buttons that call onSelectTrial.
+  trial,
+  index, // 0-based position in the list; only used for the fallback 'run-N' label.
+  onSelectTrial, // Optional; the buttons are no-ops when it is not provided.
+}: {
+  trial: EvalCaseTrial;
+  index: number;
+  onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void;
+}) {
+  const label = caseTrialPath(trial, index); // run_path, or the 'run-N' fallback.
+  return (
+    <div className="grid gap-2 rounded-md border border-gray-800 bg-gray-950/50 p-3 text-sm md:grid-cols-[minmax(8rem,1fr)_auto] md:items-center">
+      <div className="min-w-0">
+        <div className="font-medium text-gray-200">{label}</div>
+        <div className="mt-1 flex flex-wrap gap-x-3 gap-y-1 text-xs text-gray-500">
+          <span>{formatPercent(trial.score)} score</span>
+          <span>{trial.verdict ?? 'unknown'}</span>
+          {trial.duration_ms != null ? <span>{formatDuration(trial.duration_ms)}</span> : null}
+          {trial.total_tool_calls != null ? <span>{trial.total_tool_calls} tool calls</span> : null}
+        </div>
+      </div>
+      <div className="flex flex-wrap gap-2 md:justify-end">
+        <button
+          type="button"
+          onClick={() => onSelectTrial?.(trial, 'checks')}
+          className="rounded-md border border-gray-700 px-2.5 py-1 text-xs text-gray-300 transition-colors hover:border-cyan-900/60 hover:text-cyan-300"
+        >
+          Checks
+        </button>
+        <button
+          type="button"
+          onClick={() => onSelectTrial?.(trial, 'files')}
+          disabled={!trial.grading_path && !trial.metrics_path && !trial.timing_path}
+          className="rounded-md border border-gray-700 px-2.5 py-1 text-xs text-gray-300 transition-colors hover:border-cyan-900/60 hover:text-cyan-300 disabled:cursor-not-allowed disabled:opacity-50"
+        >
+          Files
+        </button>
+      </div>
+    </div>
+  );
+}
+
+function RepeatAggregateChecksTab({ // Aggregate Checks view for a repeated case: group metrics, aggregate grader scores, then one row per trial.
+  result, // Only result.scores is read here, for the aggregate grader score bars.
+  group, // Supplies the pass/mean metrics and the list of trials.
+  onSelectTrial, // Forwarded unchanged to each TrialActionRow.
+}: {
+  result: EvalResult;
+  group: RepeatRunGroup;
+  onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void;
+}) {
+  return (
+    <div className="space-y-6">
+      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+        <div className="grid gap-3 md:grid-cols-4">
+          <RunMetricRow label="Run success" value={formatPercent(group.passRate)} />
+          <RunMetricRow label="Mean score" value={formatPercent(group.meanScore)} />
+          <RunMetricRow label="Passed runs" value={`${group.passedTrials}/${group.trialCount}`} />
+          <RunMetricRow label="Assertions" value={formatPercent(group.assertionPassRate)} />
+        </div>
+      </div>
+
+      {result.scores && result.scores.length > 0 ? (
+        <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+          <h4 className="mb-3 text-sm font-medium text-gray-400">Aggregate Grader Scores</h4>
+          <div className="space-y-3">
+            {result.scores.map((score, index) => (
+              <div key={`${score.name ?? score.type ?? index}`} className="flex items-center gap-4">
+                <span className="w-40 truncate text-sm text-gray-300">
+                  {score.name ?? score.type ?? `Score ${index + 1}`}
+                </span>
+                <div className="flex-1">
+                  <ScoreBar score={score.score} />
+                </div>
+              </div>
+            ))}
+          </div>
+        </div>
+      ) : null}
+
+      <section className="space-y-3">
+        <h4 className="text-xs font-semibold uppercase tracking-wider text-gray-300">Runs</h4>
+        {group.trials.map((trial, index) => (
+          <TrialActionRow
+            key={caseTrialPath(trial, index)}
+            trial={trial}
+            index={index}
+            onSelectTrial={onSelectTrial}
+          />
+        ))}
+      </section>
+    </div>
+  );
+}
+
+type ParsedGradingArtifact = { // Shape returned by parseGradingArtifact.
+  assertions: AssertionEntry[]; // Only entries that passed the runtime shape check; empty on parse failure.
+  summary?: { // Taken from the artifact via a cast; individual field types are not checked at runtime.
+    passed?: number;
+    failed?: number;
+    total?: number;
+    pass_rate?: number;
+  };
+  error?: string; // Message of the exception caught while parsing, if any.
+};
+
+function parseGradingArtifact(content: string | undefined): ParsedGradingArtifact | null { // Parses grading JSON text; null when there is no content to parse.
+  if (!content) return null; // Covers undefined and the empty string.
+  try {
+    const parsed = JSON.parse(content) as Record<string, unknown>;
+    const rawAssertions = Array.isArray(parsed.assertions) ? parsed.assertions : []; // A non-array assertions field is treated as empty.
+    const assertions = rawAssertions.flatMap((value): AssertionEntry[] => {
+      if (!value || typeof value !== 'object') return []; // Skip non-object entries.
+      const assertion = value as Record<string, unknown>;
+      if (typeof assertion.text !== 'string' || typeof assertion.passed !== 'boolean') { // text and passed are required.
+        return [];
+      }
+      return [
+        {
+          text: assertion.text,
+          passed: assertion.passed,
+          evidence: typeof assertion.evidence === 'string' ? assertion.evidence : undefined, // Non-string evidence is dropped.
+        },
+      ];
+    });
+    const summary =
+      parsed.summary && typeof parsed.summary === 'object' ? parsed.summary : undefined; // Any truthy object is accepted as the summary.
+    return { assertions, summary: summary as ParsedGradingArtifact['summary'] }; // Cast only; summary fields are not validated.
+  } catch (error) { // Any exception thrown in the try block is reported through the error field instead of propagating.
+    return { assertions: [], error: error instanceof Error ? error.message : String(error) };
+  }
+}
+
+function TrialChecksTab({ // Checks view for one trial: run score, run metrics, and assertions parsed from the trial's grading file.
+  result, // Expected to be the per-trial result; supplies score, testId and artifact_dir.
+  trial,
+  runId,
+  projectId,
+  onOpenFile, // Called with the grading path when the user clicks 'Open grading JSON'.
+}: {
+  result: EvalResult;
+  trial: EvalCaseTrial;
+  runId: string;
+  projectId?: string;
+  onOpenFile: (path: string) => void;
+}) {
+  const gradingPath = trial.grading_path;
+  const artifactDir = result.artifact_dir;
+  const evalId = result.testId;
+  const { data: gradingContent, isLoading } = // NOTE(review): the hook branch depends on projectId AND gradingPath, unlike FilesTab which branches on projectId only.
+    projectId && gradingPath
+      ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, gradingPath, artifactDir))
+      : useEvalFileContent(runId, evalId, gradingPath ?? '', artifactDir); // NOTE(review): called with '' when gradingPath is missing; confirm the hook does not fetch for an empty path.
+  const parsed = parseGradingArtifact(gradingContent?.content); // null until content is available.
+
+  if (!gradingPath) { // No per-trial grading artifact: fall back to the standard ChecksTab.
+    return <ChecksTab result={result} projectId={projectId} />;
+  }
+
+  return (
+    <div className="space-y-6">
+      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+        <div className="flex items-center gap-4">
+          <span className="text-sm font-medium text-gray-400">Run score</span>
+          <div className="flex-1">
+            <ScoreBar score={result.score} />
+          </div>
+        </div>
+      </div>
+
+      <div className="grid gap-3 md:grid-cols-3">
+        <RunMetricRow label="Duration" value={formatDuration(trial.duration_ms)} />
+        <RunMetricRow label="Cost" value={formatCost(trial.cost_usd)} />
+        <RunMetricRow label="Tokens" value={formatTokens(caseTrialTokenTotal(trial))} />
+      </div>
+
+      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+        <div className="flex flex-wrap items-center justify-between gap-3">
+          <h4 className="text-sm font-medium text-gray-400">Grading</h4>
+          <button
+            type="button"
+            onClick={() => onOpenFile(gradingPath)}
+            className="rounded-md border border-gray-700 px-3 py-1.5 text-xs text-gray-300 transition-colors hover:border-cyan-900/60 hover:text-cyan-300"
+          >
+            Open grading JSON
+          </button>
+        </div>
+        {isLoading ? (
+          <p className="mt-3 text-sm text-gray-500">Loading grading artifact...</p>
+        ) : null}
+        {parsed?.error ? <p className="mt-3 text-sm text-red-300">{parsed.error}</p> : null}
+        {parsed?.summary ? (
+          <div className="mt-3 grid gap-3 md:grid-cols-3">
+            <RunMetricRow
+              label="Assertion pass rate"
+              value={formatPercent(parsed.summary.pass_rate)}
+            />
+            <RunMetricRow label="Passed" value={String(parsed.summary.passed ?? 0)} />
+            <RunMetricRow label="Failed" value={String(parsed.summary.failed ?? 0)} />
+          </div>
+        ) : null}
+      </div>
+
+      {parsed && parsed.assertions.length > 0 ? (
+        <div className="space-y-2">
+          {parsed.assertions.map((assertion, index) => (
+            <AssertionCard key={`${assertion.text}-${index}`} assertion={assertion} />
+          ))}
+        </div>
+      ) : !isLoading ? (
+        <p className="text-sm text-gray-500">No assertion steps recorded in grading.json.</p>
+      ) : null}
+    </div>
+  );
+}
+
 function containsFilePath(nodes: FileNode[], filePath: string | null): boolean {
   if (!filePath) return false;
   for (const node of nodes) {
@@ -442,6 +787,193 @@ function containsFilePath(nodes: FileNode[], filePath: string | null): boolean {
   return false;
 }
 
+function RepeatAggregateTranscriptTab({ // Aggregate Transcript view: one row per trial with a 'View' button and, when available, a 'Raw' link.
+  result, // Supplies testId and artifact_dir for building the raw transcript URL.
+  group,
+  runId,
+  projectId,
+  onSelectTrial, // Called with (trial, 'transcript') from the 'View' button.
+}: {
+  result: EvalResult;
+  group: RepeatRunGroup;
+  runId: string;
+  projectId?: string;
+  onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void;
+}) {
+  return (
+    <section className="space-y-3">
+      <h4 className="text-xs font-semibold uppercase tracking-wider text-gray-300">
+        Run transcripts
+      </h4>
+      {group.trials.map((trial, index) => {
+        const runLabel = caseTrialPath(trial, index); // Used both as the visible label and as the React key.
+        const transcriptPath = trial.transcript_path;
+        const transcriptHref = transcriptPath // The raw link is built only when the trial has a transcript path.
+          ? artifactFileContentUrl({
+              projectId,
+              runId,
+              evalId: result.testId,
+              filePath: transcriptPath,
+              artifactDir: result.artifact_dir,
+              raw: true,
+            })
+          : undefined;
+        return (
+          <div
+            key={runLabel}
+            className="grid gap-2 rounded-md border border-gray-800 bg-gray-950/50 p-3 text-sm md:grid-cols-[minmax(8rem,1fr)_auto] md:items-center"
+          >
+            <div className="min-w-0">
+              <div className="font-medium text-gray-200">{runLabel}</div>
+              <div className="mt-1 truncate font-mono text-xs text-gray-500" title={transcriptPath}>
+                {transcriptPath ?? 'No transcript artifact'}
+              </div>
+            </div>
+            <div className="flex flex-wrap gap-2 md:justify-end">
+              <button
+                type="button"
+                onClick={() => onSelectTrial?.(trial, 'transcript')}
+                disabled={!transcriptPath}
+                className="rounded-md border border-gray-700 px-2.5 py-1 text-xs text-gray-300 transition-colors hover:border-cyan-900/60 hover:text-cyan-300 disabled:cursor-not-allowed disabled:opacity-50"
+              >
+                View
+              </button>
+              {transcriptHref ? (
+                <a
+                  href={transcriptHref}
+                  target="_blank"
+                  rel="noreferrer"
+                  className="rounded-md px-2.5 py-1 text-xs text-cyan-400 transition-colors hover:text-cyan-300 hover:underline"
+                >
+                  Raw
+                </a>
+              ) : null}
+            </div>
+          </div>
+        );
+      })}
+    </section>
+  );
+}
+
+function TrialTranscriptTab({ // Transcript view for one trial: loads the trial's transcript and answer files and renders a TranscriptTimeline.
+  result, // Supplies testId, artifact_dir and the fallback output text.
+  trial,
+  runId,
+  projectId,
+  onOpenFile, // Passed to the timeline and used by the 'Open raw JSONL in Files' button.
+}: {
+  result: EvalResult;
+  trial: EvalCaseTrial;
+  runId: string;
+  projectId?: string;
+  onOpenFile: (path: string) => void;
+}) {
+  const evalId = result.testId;
+  const artifactDir = result.artifact_dir;
+  const transcriptPath = trial.transcript_path;
+  const answerPath = trial.answer_path;
+  const { data: transcriptContent, isLoading: isLoadingTranscript } = // NOTE(review): the hook branch depends on projectId AND transcriptPath, unlike FilesTab which branches on projectId only.
+    projectId && transcriptPath
+      ? useQuery(
+          projectEvalFileContentOptions(projectId, runId, evalId, transcriptPath, artifactDir),
+        )
+      : useEvalFileContent(runId, evalId, transcriptPath ?? '', artifactDir); // NOTE(review): called with '' when the path is missing; confirm the hook does not fetch for an empty path.
+  const { data: answerContent } = // Same selection pattern for the answer file.
+    projectId && answerPath
+      ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, answerPath, artifactDir))
+      : useEvalFileContent(runId, evalId, answerPath ?? '', artifactDir);
+
+  const transcriptValue = transcriptContent?.content ?? '';
+  const parsedTranscript = useMemo(() => parseTranscriptJsonl(transcriptValue), [transcriptValue]); // Re-parse only when the text changes.
+
+  if (!transcriptPath) { // All hooks above run before this and the following early returns.
+    return (
+      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+        <h3 className="text-sm font-medium text-gray-300">No structured transcript</h3>
+        <p className="mt-2 text-sm text-gray-500">
+          This run does not include a transcript artifact.
+        </p>
+      </div>
+    );
+  }
+
+  if (isLoadingTranscript) {
+    return (
+      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4 text-sm text-gray-500">
+        Loading transcript artifact...
+      </div>
+    );
+  }
+
+  if (parsedTranscript.error) { // Parse failure: show the message and offer the raw file in the Files tab.
+    return (
+      <div className="rounded-lg border border-red-900/50 bg-red-950/20 p-4">
+        <h3 className="text-sm font-medium text-red-300">Transcript could not be parsed</h3>
+        <p className="mt-2 text-sm text-gray-300">{parsedTranscript.error}</p>
+        <button
+          type="button"
+          onClick={() => onOpenFile(transcriptPath)}
+          className="mt-3 rounded-md border border-gray-700 px-3 py-1.5 text-sm text-gray-300 transition-colors hover:border-cyan-900/60 hover:text-cyan-300"
+        >
+          Open raw JSONL in Files
+        </button>
+      </div>
+    );
+  }
+
+  if (parsedTranscript.entries.length === 0) {
+    return (
+      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+        <h3 className="text-sm font-medium text-gray-300">Empty transcript</h3>
+        <p className="mt-2 text-sm text-gray-500">
+          <code>{transcriptPath}</code> exists but contains no JSONL rows.
+        </p>
+      </div>
+    );
+  }
+
+  const answerHref = answerPath // The raw answer link is built only when the trial has an answer path.
+    ? artifactFileContentUrl({
+        projectId,
+        runId,
+        evalId,
+        filePath: answerPath,
+        artifactDir,
+        raw: true,
+      })
+    : undefined;
+  const transcriptHref = artifactFileContentUrl({ // transcriptPath is known to be set here because of the guard above.
+    projectId,
+    runId,
+    evalId,
+    filePath: transcriptPath,
+    artifactDir,
+    raw: true,
+  });
+  const transcriptDownloadHref = artifactFileContentUrl({ // Same file, requested with download instead of raw.
+    projectId,
+    runId,
+    evalId,
+    filePath: transcriptPath,
+    artifactDir,
+    download: true,
+  });
+
+  return (
+    <TranscriptTimeline
+      entries={parsedTranscript.entries}
+      finalAnswer={answerPath ? (answerContent?.content ?? result.output) : undefined}
+      answerPath={answerPath}
+      transcriptPath={transcriptPath}
+      answerHref={answerHref}
+      transcriptHref={transcriptHref}
+      transcriptDownloadHref={transcriptDownloadHref}
+      onOpenFile={onOpenFile}
+    />
+  );
+}
+
 function TranscriptTab({
   result,
   runId,
@@ -454,13 +986,14 @@ function TranscriptTab({
   onOpenFile: (path: string) => void;
 }) {
   const evalId = result.testId;
+  const artifactDir = result.artifact_dir;
   const {
     data: transcriptData,
     isLoading: isLoadingTranscript,
     error: transcriptError,
   } = projectId
-    ? useQuery(projectEvalTranscriptOptions(projectId, runId, evalId))
-    : useEvalTranscript(runId, evalId);
+    ? useQuery(projectEvalTranscriptOptions(projectId, runId, evalId, artifactDir))
+    : useEvalTranscript(runId, evalId, artifactDir);
   const transcriptPath = transcriptData?.transcript_path;
   const answerPath = transcriptData?.answer_path;
   const transcriptContent = transcriptData?.status === 'ok' ? (transcriptData.content ?? '') : '';
@@ -541,6 +1074,7 @@ function TranscriptTab({
                   runId,
                   evalId,
                   filePath: transcriptPath,
+                  artifactDir,
                   raw: true,
                 })}
                 target="_blank"
@@ -568,7 +1102,14 @@ function TranscriptTab({
   }
 
   const answerHref = answerPath
-    ? artifactFileContentUrl({ projectId, runId, evalId, filePath: answerPath, raw: true })
+    ? artifactFileContentUrl({
+        projectId,
+        runId,
+        evalId,
+        filePath: answerPath,
+        artifactDir,
+        raw: true,
+      })
     : undefined;
   const transcriptHref = transcriptPath
     ? artifactFileContentUrl({
@@ -576,6 +1117,7 @@ function TranscriptTab({
         runId,
         evalId,
         filePath: transcriptPath,
+        artifactDir,
         raw: true,
       })
     : undefined;
@@ -585,6 +1127,7 @@ function TranscriptTab({
         runId,
         evalId,
         filePath: transcriptPath,
+        artifactDir,
         download: true,
       })
     : undefined;
@@ -617,11 +1160,12 @@ function FilesTab({
   onSelectedPathChange: (path: string) => void;
 }) {
   const evalId = result.testId;
+  const artifactDir = result.artifact_dir;
 
   // Use project-scoped API hooks when projectId is present
   const { data: filesData } = projectId
-    ? useQuery(projectEvalFilesOptions(projectId, runId, evalId))
-    : useEvalFiles(runId, evalId);
+    ? useQuery(projectEvalFilesOptions(projectId, runId, evalId, artifactDir))
+    : useEvalFiles(runId, evalId, artifactDir);
   const files = filesData?.files ?? [];
 
   const [localSelectedPath, setLocalSelectedPath] = useState<string | null>(null);
@@ -635,8 +1179,10 @@ function FilesTab({
       : null;
 
   const { data: fileContentData, isLoading: isLoadingContent } = projectId
-    ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, effectivePath ?? ''))
-    : useEvalFileContent(runId, evalId, effectivePath ?? '');
+    ? useQuery(
+        projectEvalFileContentOptions(projectId, runId, evalId, effectivePath ?? '', artifactDir),
+      )
+    : useEvalFileContent(runId, evalId, effectivePath ?? '', artifactDir);
 
   if (files.length === 0) {
     return <p className="text-sm text-gray-500">No artifact files available.</p>;
@@ -651,9 +1197,11 @@ function FilesTab({
   const displayLanguage = effectivePath ? (fileContentData?.language ?? 'plaintext') : 'plaintext';
 
   return (
-    <div className="relative flex h-full min-h-[400px] gap-4">
+    <div className="relative flex h-full min-h-0 min-w-0 gap-4 overflow-hidden">
       {/* FileTree panel — desktop: side-by-side, mobile: full-width slide-over */}
-      <div className={`${mobileShowTree ? 'block' : 'hidden'} md:block w-full md:w-auto`}>
+      <div
+        className={`${mobileShowTree ? 'block' : 'hidden'} min-h-0 w-full overflow-auto md:block md:w-auto`}
+      >
         <FileTree
           files={files}
           selectedPath={effectivePath}
@@ -667,7 +1215,9 @@ function FilesTab({
       </div>
 
       {/* MonacoViewer panel — desktop: side-by-side, mobile: full-width */}
-      <div className={`${!mobileShowTree ? 'block' : 'hidden'} md:block flex-1 h-full`}>
+      <div
+        className={`${!mobileShowTree ? 'block' : 'hidden'} h-full min-h-0 min-w-0 flex-1 overflow-hidden md:block`}
+      >
         <MonacoViewer value={displayValue} language={displayLanguage} height="100%" />
       </div>
 
diff --git a/apps/dashboard/src/components/ResultTable.tsx b/apps/dashboard/src/components/ResultTable.tsx
index 4660f3aea..37b577ad8 100644
--- a/apps/dashboard/src/components/ResultTable.tsx
+++ b/apps/dashboard/src/components/ResultTable.tsx
@@ -7,24 +7,25 @@
  */
 
 import type React from 'react';
-import { useEffect, useMemo, useState } from 'react';
-
-import { Link } from '@tanstack/react-router';
+import { Fragment, useEffect, useMemo, useState } from 'react';
 
 import { useFeedback } from '~/lib/api';
 import {
   RESULT_TABLE_VIEW_PRESETS,
+  type RepeatRunGroup,
   type ResultTableColumn,
   type ResultTableRow,
   type ResultTableState,
   type ResultTableStateInput,
   buildResultTableModel,
 } from '~/lib/result-table';
-import type { EvalResult, ScoreEntry } from '~/lib/types';
+import type { EvalCaseTrial, EvalResult, ScoreEntry } from '~/lib/types';
 
 import { EvalDetail } from './EvalDetail';
 import { PassRatePill } from './PassRatePill';
 
+type DetailTab = 'checks' | 'transcript' | 'source' | 'files' | 'feedback'; // Same members as the Tab union declared in EvalDetail.tsx.
+
 interface ResultTableProps {
   results: readonly EvalResult[];
   runId: string;
@@ -44,6 +45,9 @@ const QUERY_KEYS = {
   detail: 'results_detail',
 } as const;
 
+const CHECK_MARK = '\u2713'; // U+2713 CHECK MARK
+const CROSS_MARK = '\u2717'; // U+2717 BALLOT X
+
 function readUrlState(): ResultTableStateInput {
   if (typeof window === 'undefined') return {};
   const params = new URLSearchParams(window.location.search);
@@ -132,6 +136,24 @@ function formatTokens(tokens: number | undefined): string | undefined {
   return `${tokens} tok`;
 }
 
+function tokenUsageTotal( // Sums the finite input/output/reasoning/cached counters of a usage record.
+  usage: EvalCaseTrial['token_usage'] | EvalResult['tokenUsage'],
+): number | undefined {
+  if (!usage) return undefined;
+  const values = [usage.input, usage.output, usage.reasoning, usage.cached].filter( // Keep only finite numeric counters.
+    (value): value is number => typeof value === 'number' && Number.isFinite(value),
+  );
+  return values.length > 0 ? values.reduce((sum, value) => sum + value, 0) : undefined; // undefined when no counter is usable.
+}
+
+function caseTrialTokenTotal(trial: EvalCaseTrial): number | undefined { // Total tokens for a trial, or undefined when nothing usable is recorded.
+  return trial.total_tokens ?? tokenUsageTotal(trial.token_usage); // An explicit total wins; otherwise sum the usage breakdown.
+}
+
+function caseTrialPath(trial: EvalCaseTrial, index = 0): string { // Label/key for a trial: its run_path when present.
+  return trial.run_path ?? `run-${trial.attempt ?? index + 1}`; // Otherwise 'run-N' from attempt, or from the 0-based index + 1.
+}
+
 function compactTokenBreakdown(result: EvalResult): string | undefined {
   const usage = result.tokenUsage;
   if (!usage) return undefined;
@@ -166,6 +188,12 @@ export function ResultTable({
 }: ResultTableProps) {
   const [urlState, setUrlState] = useState<ResultTableStateInput>(() => readUrlState());
   const [selectedRowKey, setSelectedRowKey] = useState<string | null>(() => readSelectedRowKey());
+  const [selectedTrialPath, setSelectedTrialPath] = useState<string | null>(null);
+  const [selectedDetailFilePath, setSelectedDetailFilePath] = useState<string | null>(null);
+  const [selectedDetailTab, setSelectedDetailTab] = useState<DetailTab>('checks');
+  const [collapsedRepeatRows, setCollapsedRepeatRows] = useState<ReadonlySet<string>>(
+    () => new Set(),
+  );
   const { data: feedback } = useFeedback(projectId);
   const reviewedTestIds = useMemo(
     () => feedback?.reviews.map((review) => review.test_id) ?? [],
@@ -186,11 +214,25 @@ export function ResultTable({
     selectedRowKey != null
       ? (model.filteredRows.find((row) => row.key === selectedRowKey) ?? null)
       : null;
+  const repeatGroupsByRowKey = useMemo(
+    () => new Map(model.repeatGroups.map((group) => [group.row.key, group])),
+    [model.repeatGroups],
+  );
+  const selectedRepeatGroup = selectedRow ? repeatGroupsByRowKey.get(selectedRow.key) : undefined;
+  const selectedTrial =
+    selectedRepeatGroup && selectedTrialPath
+      ? (selectedRepeatGroup.trials.find(
+          (trial, index) => caseTrialPath(trial, index) === selectedTrialPath,
+        ) ?? null)
+      : null;
 
   useEffect(() => {
     const handlePopState = () => {
       setUrlState(readUrlState());
       setSelectedRowKey(readSelectedRowKey());
+      setSelectedTrialPath(null);
+      setSelectedDetailFilePath(null);
+      setSelectedDetailTab('checks');
     };
     window.addEventListener('popstate', handlePopState);
     return () => window.removeEventListener('popstate', handlePopState);
@@ -219,6 +261,9 @@ export function ResultTable({
     window.history.replaceState(window.history.state, '', nextUrl);
     setUrlState({});
     setSelectedRowKey(null);
+    setSelectedTrialPath(null);
+    setSelectedDetailFilePath(null);
+    setSelectedDetailTab('checks');
   }
 
   function toggleColumn(columnId: string) {
@@ -231,11 +276,34 @@ export function ResultTable({
   function openRowDetail(rowKey: string) {
     writeSelectedRowKey(rowKey);
     setSelectedRowKey(rowKey);
+    setSelectedTrialPath(null);
+    setSelectedDetailFilePath(null);
+    setSelectedDetailTab('checks');
+  }
+
+  function openTrialDetail(rowKey: string, trial: EvalCaseTrial, initialTab: DetailTab = 'checks') {
+    writeSelectedRowKey(rowKey);
+    setSelectedRowKey(rowKey);
+    setSelectedTrialPath(caseTrialPath(trial));
+    setSelectedDetailTab(initialTab);
+    setSelectedDetailFilePath(primaryTrialArtifactPath(trial));
   }
 
   function closeRowDetail() {
     writeSelectedRowKey(null);
     setSelectedRowKey(null);
+    setSelectedTrialPath(null);
+    setSelectedDetailFilePath(null);
+    setSelectedDetailTab('checks');
+  }
+
+  function toggleRepeatGroup(rowKey: string) {
+    setCollapsedRepeatRows((current) => {
+      const next = new Set(current);
+      if (next.has(rowKey)) next.delete(rowKey);
+      else next.add(rowKey);
+      return next;
+    });
   }
 
   if (results.length === 0) {
@@ -370,58 +438,18 @@ export function ResultTable({
               </p>
             </div>
           ) : (
-            <div className="max-w-full overflow-x-auto rounded-lg border border-gray-800">
-              <table
-                className="w-full whitespace-nowrap text-left text-sm"
-                style={{ minWidth: `${Math.max(860, model.visibleColumns.length * 136)}px` }}
-              >
-                <thead className="border-b border-gray-800 bg-gray-900/50">
-                  <tr>
-                    {model.visibleColumns.map((column) => (
-                      <th
-                        key={column.id}
-                        className={`px-4 py-3 font-medium text-gray-400 ${
-                          isNumericColumn(column.id) ? 'text-right' : ''
-                        }`}
-                        title={column.label}
-                      >
-                        <span className="block max-w-48 truncate">{column.label}</span>
-                      </th>
-                    ))}
-                  </tr>
-                </thead>
-                <tbody className="divide-y divide-gray-800/50">
-                  {model.filteredRows.map((row) => {
-                    const isSelected = selectedRowKey === row.key;
-                    return (
-                      <tr
-                        key={row.key}
-                        className={`transition-colors ${
-                          isSelected ? 'bg-cyan-950/20' : 'hover:bg-gray-900/30'
-                        }`}
-                      >
-                        {model.visibleColumns.map((column) => (
-                          <td
-                            key={`${row.key}:${column.id}`}
-                            className={`px-4 py-3 align-middle ${
-                              isNumericColumn(column.id) ? 'text-right tabular-nums' : ''
-                            }`}
-                          >
-                            <ResultCell
-                              column={column}
-                              row={row}
-                              passThreshold={passThreshold}
-                              onOpenDetail={openRowDetail}
-                              isSelected={isSelected}
-                            />
-                          </td>
-                        ))}
-                      </tr>
-                    );
-                  })}
-                </tbody>
-              </table>
-            </div>
+            <ResultRowsTable
+              rows={model.filteredRows}
+              visibleColumns={model.visibleColumns}
+              passThreshold={passThreshold}
+              selectedRowKey={selectedRowKey}
+              selectedTrialPath={selectedTrialPath}
+              repeatGroupsByRowKey={repeatGroupsByRowKey}
+              collapsedRepeatRows={collapsedRepeatRows}
+              onToggleRepeatGroup={toggleRepeatGroup}
+              onOpenDetail={openRowDetail}
+              onOpenTrialDetail={openTrialDetail}
+            />
           )}
         </div>
 
@@ -430,6 +458,14 @@ export function ResultTable({
             row={selectedRow}
             runId={runId}
             projectId={projectId}
+            repeatGroup={selectedRepeatGroup}
+            selectedTrial={selectedTrial}
+            selectedTrialPath={selectedTrialPath}
+            initialTab={selectedDetailTab}
+            initialFilePath={selectedDetailFilePath}
+            onOpenTrialDetail={(trial, initialTab) =>
+              openTrialDetail(selectedRow.key, trial, initialTab)
+            }
             onClose={closeRowDetail}
           />
         )}
@@ -438,22 +474,326 @@ export function ResultTable({
   );
 }
 
+/**
+ * Renders the results table: one focusable row per result and, beneath every repeat group
+ * that is not collapsed, one additional row per trial of that group.
+ *
+ * Clicking a result row, or pressing Enter/Space while the row itself has focus, calls
+ * `onOpenDetail`; trial rows call `onOpenTrialDetail` instead. A result row is highlighted
+ * when it is the selected row and no trial is selected; a trial row is highlighted when it
+ * belongs to the selected row and its path equals `selectedTrialPath`.
+ */
+function ResultRowsTable({
+  rows,
+  visibleColumns,
+  passThreshold,
+  selectedRowKey,
+  selectedTrialPath,
+  repeatGroupsByRowKey,
+  collapsedRepeatRows,
+  onToggleRepeatGroup,
+  onOpenDetail,
+  onOpenTrialDetail,
+}: {
+  rows: readonly ResultTableRow[];
+  visibleColumns: readonly ResultTableColumn[];
+  passThreshold: number;
+  selectedRowKey: string | null;
+  selectedTrialPath: string | null;
+  repeatGroupsByRowKey: ReadonlyMap<string, RepeatRunGroup>;
+  collapsedRepeatRows: ReadonlySet<string>;
+  onToggleRepeatGroup: (rowKey: string) => void;
+  onOpenDetail: (rowKey: string) => void;
+  onOpenTrialDetail: (rowKey: string, trial: EvalCaseTrial) => void;
+}) {
+  return (
+    <div className="max-w-full overflow-x-auto rounded-lg border border-gray-800">
+      <table
+        className="w-full whitespace-nowrap text-left text-sm"
+        style={{ minWidth: `${resultTableMinWidth(visibleColumns)}px` }}
+      >
+        <thead className="border-b border-gray-800 bg-gray-900/50">
+          <tr>
+            {visibleColumns.map((column) => (
+              <th key={column.id} className={columnHeaderClassName(column.id)} title={column.label}>
+                <span
+                  className={
+                    isVisuallyHiddenHeader(column.id) ? 'sr-only' : 'block max-w-48 truncate'
+                  }
+                >
+                  {column.label}
+                </span>
+              </th>
+            ))}
+          </tr>
+        </thead>
+        <tbody className="divide-y divide-gray-800/50">
+          {rows.map((row) => {
+            const repeatGroup = repeatGroupsByRowKey.get(row.key);
+            const isSelected = selectedRowKey === row.key && !selectedTrialPath;
+            // Rows without a repeat group have nothing to expand and count as collapsed.
+            const collapsed = repeatGroup ? collapsedRepeatRows.has(row.key) : true;
+            return (
+              <Fragment key={row.key}>
+                <tr
+                  className={`cursor-pointer transition-colors ${
+                    isSelected ? 'bg-cyan-950/20' : 'hover:bg-gray-900/30'
+                  }`}
+                  onClick={() => onOpenDetail(row.key)}
+                  onKeyDown={(event) => {
+                    // Only react to keys pressed on the row itself. Key presses bubbling up
+                    // from nested controls (e.g. the expander button) must keep their native
+                    // behaviour instead of being cancelled and opening the detail panel.
+                    if (event.target !== event.currentTarget) return;
+                    if (event.key === 'Enter' || event.key === ' ') {
+                      event.preventDefault();
+                      onOpenDetail(row.key);
+                    }
+                  }}
+                  tabIndex={0}
+                  aria-selected={isSelected}
+                >
+                  {visibleColumns.map((column) => (
+                    <td
+                      key={`${row.key}:${column.id}`}
+                      className={columnCellClassName(column.id, 'py-3')}
+                    >
+                      <ResultCell
+                        column={column}
+                        row={row}
+                        repeatGroup={repeatGroup}
+                        repeatCollapsed={collapsed}
+                        passThreshold={passThreshold}
+                        isSelected={isSelected}
+                        onToggleRepeatGroup={onToggleRepeatGroup}
+                      />
+                    </td>
+                  ))}
+                </tr>
+                {repeatGroup && !collapsed
+                  ? repeatGroup.trials.map((trial, index) => {
+                      const trialPath = caseTrialPath(trial, index);
+                      const trialSelected =
+                        selectedRowKey === row.key && selectedTrialPath === trialPath;
+                      return (
+                        <tr
+                          key={`${row.key}:${trialPath}`}
+                          // Exactly one background utility per state: two `bg-*` classes on
+                          // the same element would leave the winner to stylesheet order.
+                          className={`cursor-pointer transition-colors ${
+                            trialSelected ? 'bg-cyan-950/20' : 'bg-gray-950/40 hover:bg-gray-900/50'
+                          }`}
+                          onClick={() => onOpenTrialDetail(row.key, trial)}
+                          onKeyDown={(event) => {
+                            // Same guard as the parent row: ignore keys from nested elements.
+                            if (event.target !== event.currentTarget) return;
+                            if (event.key === 'Enter' || event.key === ' ') {
+                              event.preventDefault();
+                              onOpenTrialDetail(row.key, trial);
+                            }
+                          }}
+                          tabIndex={0}
+                          aria-selected={trialSelected}
+                        >
+                          {visibleColumns.map((column) => (
+                            <td
+                              key={`${row.key}:${trialPath}:${column.id}`}
+                              className={columnCellClassName(column.id, 'py-2')}
+                            >
+                              <TrialResultCell
+                                column={column}
+                                row={row}
+                                trial={trial}
+                                index={index}
+                                passThreshold={passThreshold}
+                              />
+                            </td>
+                          ))}
+                        </tr>
+                      );
+                    })
+                  : null}
+              </Fragment>
+            );
+          })}
+        </tbody>
+      </table>
+    </div>
+  );
+}
+
+/**
+ * Decides whether a trial counts as passed.
+ *
+ * An explicit `pass`/`fail` verdict is authoritative; without one, the trial passes only
+ * when it has a numeric score of at least `passThreshold`.
+ */
+function caseTrialPassed(trial: EvalCaseTrial, passThreshold: number): boolean {
+  switch (trial.verdict) {
+    case 'pass':
+      return true;
+    case 'fail':
+      return false;
+    default:
+      return typeof trial.score === 'number' && trial.score >= passThreshold;
+  }
+}
+
+/**
+ * Returns the first artifact path recorded on a trial, checked in the order grading,
+ * metrics, timing, transcript, answer, or `null` when none of them is set.
+ */
+function primaryTrialArtifactPath(trial: EvalCaseTrial): string | null {
+  const { grading_path, metrics_path, timing_path, transcript_path, answer_path } = trial;
+  const candidates = [grading_path, metrics_path, timing_path, transcript_path, answer_path];
+  // Only null/undefined entries are skipped, so an empty-string path is still returned.
+  return candidates.find((path) => path != null) ?? null;
+}
+
+/**
+ * Renders a single cell of a trial row for the given column.
+ *
+ * Target, suite and category repeat the parent row's labels; status, test, score, duration,
+ * cost/tokens and error are taken from the trial. The review column and any unrecognised
+ * column show a dash.
+ */
+function TrialResultCell({
+  column,
+  row,
+  trial,
+  index,
+  passThreshold,
+}: {
+  column: ResultTableColumn;
+  row: ResultTableRow;
+  trial: EvalCaseTrial;
+  index: number;
+  passThreshold: number;
+}) {
+  // An execution error takes precedence over the pass/fail outcome.
+  let status = 'failing';
+  let statusLabel = 'Failing';
+  if (trial.execution_status === 'execution_error') {
+    status = 'error';
+    statusLabel = 'Error';
+  } else if (caseTrialPassed(trial, passThreshold)) {
+    status = 'passing';
+    statusLabel = 'Passing';
+  }
+
+  switch (column.id) {
+    case 'status':
+      return <ResultStatusSymbol status={status} label={statusLabel} />;
+    case 'expander':
+      return <span aria-hidden="true" className="block h-5" />;
+    case 'test':
+      return <TrialTestCell label={caseTrialPath(trial, index)} trial={trial} />;
+    case 'target':
+      return <TargetCell target={row.targetLabel} tone="text-gray-500" />;
+    case 'score':
+      return <PassRatePill rate={trial.score ?? 0} />;
+    case 'suite':
+      return <TruncatedMuted value={row.suiteLabel} tone="text-gray-500" />;
+    case 'category':
+      return <TruncatedMuted value={row.categoryLabel} tone="text-gray-500" />;
+    case 'duration':
+      return <span className="text-gray-500">{formatDuration(trial.duration_ms)}</span>;
+    case 'cost_tokens':
+      return <TrialCostTokenCell trial={trial} />;
+    case 'review':
+      return <span className="text-gray-700">-</span>;
+    case 'error':
+      return <TruncatedMuted value={trial.error} tone="text-red-300" />;
+    default:
+      return <span className="text-gray-700">-</span>;
+  }
+}
+
+/**
+ * Test-column cell of a trial row: the trial label, the tool-call count when the trial
+ * records one, and the trial's error message when it has one.
+ */
+function TrialTestCell({ label, trial }: { label: string; trial: EvalCaseTrial }) {
+  const { error, total_tool_calls: toolCalls } = trial;
+  return (
+    <div className="max-w-[24rem] min-w-0 pl-6">
+      <div className="truncate font-medium text-gray-300" title={label}>
+        {label}
+      </div>
+      <div className="mt-0.5 flex min-w-0 flex-wrap gap-x-3 gap-y-0.5 text-xs text-gray-600">
+        {toolCalls != null ? <span>{toolCalls} tool calls</span> : null}
+      </div>
+      {error ? (
+        <div className="mt-0.5 truncate text-xs text-red-300" title={error}>
+          {error}
+        </div>
+      ) : null}
+    </div>
+  );
+}
+
+/**
+ * Compact status glyph: `CHECK_MARK` for the `passing` status, `CROSS_MARK` for every other
+ * status. `error` and `partial` are tinted amber; any other non-passing status is red.
+ *
+ * `label` is shown as the hover title and is also the glyph's accessible name.
+ */
+function ResultStatusSymbol({ status, label }: { status: string; label: string }) {
+  const passing = status === 'passing';
+  const warning = status === 'error' || status === 'partial';
+  const symbol = passing ? CHECK_MARK : CROSS_MARK;
+  const tone = passing ? 'text-emerald-300' : warning ? 'text-amber-300' : 'text-red-300';
+  // A `title` on a plain span is not reliably announced by assistive technology, so the
+  // glyph is additionally exposed as an image whose name is the status label.
+  return (
+    <span
+      role="img"
+      aria-label={label}
+      className={`inline-flex text-base font-semibold ${tone}`}
+      title={label}
+    >
+      {symbol}
+    </span>
+  );
+}
+
+/**
+ * Status glyph for a repeat group: `passing` when the group's pass rate reaches
+ * `passThreshold`, `partial` when it falls short but at least one trial passed, and
+ * `failing` otherwise. The label reports how many runs passed.
+ */
+function RepeatStatusCell({
+  group,
+  passThreshold,
+}: {
+  group: RepeatRunGroup;
+  passThreshold: number;
+}) {
+  const { passRate, passedTrials, trialCount } = group;
+  let status = 'failing';
+  if (passRate >= passThreshold) {
+    status = 'passing';
+  } else if (passedTrials > 0) {
+    status = 'partial';
+  }
+  const label = `${passedTrials}/${trialCount} runs passed`;
+  return <ResultStatusSymbol status={status} label={label} />;
+}
+
+/**
+ * One-line summary of a repeat group: run count, run success rate and mean score, followed
+ * by assertion, tool-call and artifact figures when the group has them. The rendered line
+ * is truncated, so the full text is repeated in the hover title.
+ */
+function RepeatSummaryText({ group }: { group: RepeatRunGroup }) {
+  const parts: string[] = [
+    `${group.trialCount} runs`,
+    `${formatPercent(group.passRate)} run success`,
+    `${formatPercent(group.meanScore)} mean score`,
+  ];
+  if (group.assertionPassRate != null) {
+    const tally = `${group.passedAssertions}/${group.assertionCount}`;
+    parts.push(`${formatPercent(group.assertionPassRate)} assertions (${tally})`);
+  }
+  if (group.totalToolCalls != null) {
+    parts.push(`${group.totalToolCalls} tool calls`);
+  }
+  if (group.artifactCount > 0) {
+    parts.push(`${group.artifactCount} artifacts`);
+  }
+  const summary = parts.join(' · ');
+  return (
+    <div className="mt-0.5 truncate text-xs text-gray-500" title={summary}>
+      {summary}
+    </div>
+  );
+}
+
+/** Score cell for a repeat group: the group's mean score rendered as a pass-rate pill. */
+function RepeatScoreCell({ group }: { group: RepeatRunGroup }) {
+  const { meanScore } = group;
+  return <PassRatePill rate={meanScore} />;
+}
+
+/**
+ * Duration cell for a repeat group: the group's mean duration, falling back to the row
+ * result's own duration when the group has no mean.
+ */
+function RepeatDurationCell({ group, row }: { group: RepeatRunGroup; row: ResultTableRow }) {
+  const durationMs = group.meanDurationMs ?? row.result.durationMs;
+  return <span className="text-gray-400">{formatDuration(durationMs)}</span>;
+}
+
 function isNumericColumn(columnId: string): boolean {
   return ['duration', 'cost_tokens'].includes(columnId) || columnId.startsWith('grader:');
 }
 
+/** True for the narrow icon-only columns (`status` and `expander`). */
+function isCompactColumn(columnId: string): boolean {
+  return ['status', 'expander'].includes(columnId);
+}
+
+/**
+ * Whether a column's header label should be hidden visually; currently this is the case
+ * for exactly the compact columns.
+ */
+function isVisuallyHiddenHeader(columnId: string): boolean {
+  const compact = isCompactColumn(columnId);
+  return compact;
+}
+
+/**
+ * Minimum table width in pixels: 44 per compact column plus 136 per regular column, but
+ * never less than 760.
+ */
+function resultTableMinWidth(columns: readonly ResultTableColumn[]): number {
+  let total = 0;
+  for (const column of columns) {
+    total += isCompactColumn(column.id) ? 44 : 136;
+  }
+  return Math.max(760, total);
+}
+
+/**
+ * Class list for a header cell. Compact columns get a fixed `w-11`, centred cell; all other
+ * columns use the regular padding and are right-aligned when the column is numeric.
+ */
+function columnHeaderClassName(columnId: string): string {
+  if (!isCompactColumn(columnId)) {
+    const alignment = isNumericColumn(columnId) ? 'text-right' : '';
+    return `px-4 py-3 font-medium text-gray-400 ${alignment}`;
+  }
+  return 'w-11 min-w-11 max-w-11 px-2 py-3 text-center font-medium text-gray-400';
+}
+
+/**
+ * Class list for a body cell with the given vertical padding. Compact columns get a fixed
+ * `w-11`, centred cell; all other columns use the regular horizontal padding and switch to
+ * right-aligned tabular figures when the column is numeric.
+ */
+function columnCellClassName(columnId: string, paddingY: 'py-2' | 'py-3'): string {
+  if (!isCompactColumn(columnId)) {
+    const numeric = isNumericColumn(columnId) ? 'text-right tabular-nums' : '';
+    return `px-4 ${paddingY} align-middle ${numeric}`;
+  }
+  return `w-11 min-w-11 max-w-11 px-2 ${paddingY} text-center align-middle`;
+}
+
 function ResultCell({
   column,
   row,
+  repeatGroup,
+  repeatCollapsed,
   passThreshold,
-  onOpenDetail,
   isSelected,
+  onToggleRepeatGroup,
 }: {
   column: ResultTableColumn;
   row: ReturnType<typeof buildResultTableModel>['filteredRows'][number];
+  repeatGroup?: RepeatRunGroup;
+  repeatCollapsed: boolean;
   passThreshold: number;
-  onOpenDetail: (rowKey: string) => void;
   isSelected: boolean;
+  onToggleRepeatGroup: (rowKey: string) => void;
 }) {
   if (column.id.startsWith('grader:')) {
     const graderName = column.id.slice('grader:'.length);
@@ -464,16 +804,32 @@ function ResultCell({
 
   switch (column.id) {
     case 'status':
-      return <StatusCell status={row.status} label={row.statusLabel} />;
+      return repeatGroup ? (
+        <RepeatStatusCell group={repeatGroup} passThreshold={passThreshold} />
+      ) : (
+        <ResultStatusSymbol status={row.status} label={row.statusLabel} />
+      );
+    case 'expander':
+      return repeatGroup ? (
+        <ExpanderCell
+          row={row}
+          repeatCollapsed={repeatCollapsed}
+          onToggleRepeatGroup={onToggleRepeatGroup}
+        />
+      ) : (
+        <span aria-hidden="true" className="block h-5" />
+      );
     case 'test':
-      return <TestCell row={row} onOpenDetail={onOpenDetail} isSelected={isSelected} />;
-    case 'model_target':
-      return <ModelTargetCell row={row} />;
+      return <TestCell row={row} repeatGroup={repeatGroup} isSelected={isSelected} />;
+    case 'target':
+      return <TargetCell target={row.targetLabel} />;
     case 'score':
       return row.executionError ? (
         <span className="inline-flex rounded-md border border-amber-900/60 bg-amber-950/20 px-2 py-0.5 text-xs font-medium text-amber-300">
           Execution error
         </span>
+      ) : repeatGroup ? (
+        <RepeatScoreCell group={repeatGroup} />
       ) : (
         <PassRatePill rate={row.result.score} />
       );
@@ -482,9 +838,13 @@ function ResultCell({
     case 'category':
       return <TruncatedMuted value={row.categoryLabel} />;
     case 'duration':
-      return <span className="text-gray-400">{formatDuration(row.result.durationMs)}</span>;
+      return repeatGroup ? (
+        <RepeatDurationCell group={repeatGroup} row={row} />
+      ) : (
+        <span className="text-gray-400">{formatDuration(row.result.durationMs)}</span>
+      );
     case 'cost_tokens':
-      return <CostTokenCell row={row} />;
+      return <CostTokenCell row={row} repeatGroup={repeatGroup} />;
     case 'review':
       return (
         <span className={row.reviewed ? 'text-emerald-300' : 'text-gray-500'}>
@@ -498,48 +858,26 @@ function ResultCell({
   }
 }
 
-function StatusCell({ status, label }: { status: string; label: string }) {
-  const tone =
-    status === 'passing'
-      ? 'border-emerald-900/60 bg-emerald-950/20 text-emerald-300'
-      : status === 'error'
-        ? 'border-amber-900/60 bg-amber-950/20 text-amber-300'
-        : 'border-red-900/60 bg-red-950/20 text-red-300';
-  const dot =
-    status === 'passing' ? 'bg-emerald-400' : status === 'error' ? 'bg-amber-400' : 'bg-red-400';
-  return (
-    <span
-      className={`inline-flex items-center gap-1.5 rounded-md border px-2 py-0.5 text-xs ${tone}`}
-    >
-      <span className={`h-1.5 w-1.5 rounded-full ${dot}`} />
-      {label}
-    </span>
-  );
-}
-
 function TestCell({
   row,
-  onOpenDetail,
+  repeatGroup,
   isSelected,
 }: {
   row: ReturnType<typeof buildResultTableModel>['filteredRows'][number];
-  onOpenDetail: (rowKey: string) => void;
+  repeatGroup?: RepeatRunGroup;
   isSelected: boolean;
 }) {
-  const className =
-    'block min-w-0 truncate text-left font-medium text-cyan-400 hover:text-cyan-300 hover:underline';
-
   return (
     <div className="max-w-[24rem] min-w-0">
-      <button
-        type="button"
-        onClick={() => onOpenDetail(row.key)}
-        className={className}
+      <span
+        className={`block min-w-0 truncate text-left font-medium ${
+          isSelected ? 'text-cyan-200' : 'text-cyan-400'
+        }`}
         title={row.testId}
-        aria-pressed={isSelected}
       >
         {row.testId}
-      </button>
+      </span>
+      {repeatGroup ? <RepeatSummaryText group={repeatGroup} /> : null}
       {row.result.error ? (
         <div className="mt-0.5 truncate text-xs text-red-300" title={row.result.error}>
           {row.result.error}
@@ -553,20 +891,46 @@ function ResultDetailPanel({
   row,
   runId,
   projectId,
+  repeatGroup,
+  selectedTrial,
+  selectedTrialPath,
+  initialTab,
+  initialFilePath,
+  onOpenTrialDetail,
   onClose,
 }: {
   row: ResultTableRow;
   runId: string;
   projectId?: string;
+  repeatGroup?: RepeatRunGroup;
+  selectedTrial: EvalCaseTrial | null;
+  selectedTrialPath: string | null;
+  initialTab: DetailTab;
+  initialFilePath: string | null;
+  onOpenTrialDetail: (trial: EvalCaseTrial, initialTab?: DetailTab) => void;
   onClose: () => void;
 }) {
+  const evalDetailHref = buildEvalDetailHref({
+    projectId,
+    runId,
+    evalId: row.testId,
+    artifactDir: row.result.artifact_dir,
+  });
+  const title = selectedTrialPath ? `${row.testId} · ${selectedTrialPath}` : row.testId;
+  const showAggregateRepeatDetail = repeatGroup && !selectedTrial;
+  const panelScrollKey = `${row.key}:${selectedTrialPath ?? ''}:${initialTab}`;
+
   return (
-    <aside className="min-w-0 rounded-lg border border-gray-800 bg-gray-950/80 xl:sticky xl:top-4 xl:max-h-[calc(100vh-2rem)]">
+    <aside
+      key={panelScrollKey}
+      ref={scrollPanelIntoView}
+      className="min-w-0 rounded-lg border border-gray-800 bg-gray-950/80 xl:sticky xl:top-4 xl:max-h-[calc(100vh-2rem)]"
+    >
       <div className="flex min-w-0 items-start justify-between gap-3 border-b border-gray-800 px-4 py-3">
         <div className="min-w-0">
           <p className="text-xs font-medium uppercase tracking-wider text-gray-500">Row detail</p>
-          <h4 className="mt-1 truncate text-base font-semibold text-white" title={row.testId}>
-            {row.testId}
+          <h4 className="mt-1 truncate text-base font-semibold text-white" title={title}>
+            {title}
           </h4>
           <p className="mt-1 truncate text-xs text-gray-500" title={row.targetLabel}>
             {row.targetLabel}
@@ -574,23 +938,12 @@ function ResultDetailPanel({
           </p>
         </div>
         <div className="flex shrink-0 items-center gap-2">
-          {projectId ? (
-            <Link
-              to="/projects/$projectId/evals/$runId/$evalId"
-              params={{ projectId, runId, evalId: row.testId }}
-              className="rounded-md border border-gray-800 px-2.5 py-1.5 text-xs text-gray-400 transition-colors hover:border-gray-700 hover:text-gray-200"
-            >
-              Full page
-            </Link>
-          ) : (
-            <Link
-              to="/evals/$runId/$evalId"
-              params={{ runId, evalId: row.testId }}
-              className="rounded-md border border-gray-800 px-2.5 py-1.5 text-xs text-gray-400 transition-colors hover:border-gray-700 hover:text-gray-200"
-            >
-              Full page
-            </Link>
-          )}
+          <a
+            href={evalDetailHref}
+            className="rounded-md border border-gray-800 px-2.5 py-1.5 text-xs text-gray-400 transition-colors hover:border-gray-700 hover:text-gray-200"
+          >
+            Full page
+          </a>
           <button
             type="button"
             onClick={onClose}
@@ -601,38 +954,98 @@ function ResultDetailPanel({
         </div>
       </div>
       <div className="h-[36rem] min-h-[28rem] overflow-hidden xl:h-[calc(100vh-9rem)]">
-        <EvalDetail eval={row.result} runId={runId} projectId={projectId} />
+        <EvalDetail
+          key={`${row.key}:${selectedTrialPath ?? 'aggregate'}:${initialFilePath ?? ''}`}
+          eval={row.result}
+          runId={runId}
+          projectId={projectId}
+          repeatGroup={showAggregateRepeatDetail ? repeatGroup : undefined}
+          selectedTrial={selectedTrial}
+          initialTab={initialTab}
+          initialSelectedFilePath={initialFilePath}
+          onSelectTrial={onOpenTrialDetail}
+        />
       </div>
     </aside>
   );
 }
 
-function ModelTargetCell({
+/**
+ * Truncated target label with the full text as hover title. `tone` is the text colour
+ * class and defaults to `text-gray-300`.
+ */
+function TargetCell({ target, tone = 'text-gray-300' }: { target: string; tone?: string }) {
+  const className = `max-w-[14rem] truncate ${tone}`;
+  return (
+    <div className={className} title={target}>
+      {target}
+    </div>
+  );
+}
+
+function ExpanderCell({
   row,
+  repeatCollapsed,
+  onToggleRepeatGroup,
 }: {
-  row: ReturnType<typeof buildResultTableModel>['filteredRows'][number];
+  row: ResultTableRow;
+  repeatCollapsed: boolean;
+  onToggleRepeatGroup: (rowKey: string) => void;
 }) {
   return (
-    <div className="max-w-[16rem] min-w-0">
-      <div className="truncate text-gray-300" title={row.targetLabel}>
-        {row.targetLabel}
-      </div>
-      {row.modelLabel ? (
-        <div className="mt-0.5 truncate text-xs text-gray-500" title={row.modelLabel}>
-          {row.modelLabel}
-        </div>
-      ) : null}
-    </div>
+    <button
+      type="button"
+      onClick={(event) => {
+        event.stopPropagation();
+        onToggleRepeatGroup(row.key);
+      }}
+      className="inline-flex h-6 w-6 items-center justify-center rounded-md border border-gray-800 text-xs text-gray-400 transition-colors hover:border-gray-700 hover:text-gray-200"
+      aria-expanded={!repeatCollapsed}
+      aria-label={`${repeatCollapsed ? 'Expand' : 'Collapse'} ${row.testId}`}
+    >
+      {repeatCollapsed ? '+' : '-'}
+    </button>
   );
 }
 
+/**
+ * Builds the URL of the full-page eval detail view.
+ *
+ * The path is `/projects/<projectId>/evals/<runId>/<evalId>` when a project id is given and
+ * `/evals/<runId>/<evalId>` otherwise, with every segment URI-encoded. A non-empty
+ * `artifactDir` is appended as the `artifact_dir` query parameter.
+ */
+function buildEvalDetailHref(options: {
+  projectId?: string;
+  runId: string;
+  evalId: string;
+  artifactDir?: string;
+}): string {
+  const { projectId, runId, evalId, artifactDir } = options;
+  const projectPrefix = projectId ? `/projects/${encodeURIComponent(projectId)}` : '';
+  const path = `${projectPrefix}/evals/${encodeURIComponent(runId)}/${encodeURIComponent(evalId)}`;
+  return artifactDir ? `${path}?artifact_dir=${encodeURIComponent(artifactDir)}` : path;
+}
+
+/**
+ * Ref callback that scrolls the given panel into view on the next animation frame, using
+ * the `nearest` alignment on both axes. Does nothing when called with `null`.
+ */
+function scrollPanelIntoView(panel: HTMLElement | null) {
+  if (panel) {
+    const reveal = () => panel.scrollIntoView({ block: 'nearest', inline: 'nearest' });
+    window.requestAnimationFrame(reveal);
+  }
+}
+
 function CostTokenCell({
   row,
+  repeatGroup,
 }: {
   row: ReturnType<typeof buildResultTableModel>['filteredRows'][number];
+  repeatGroup?: RepeatRunGroup;
 }) {
-  const cost = formatCost(row.result.costUsd);
-  const tokens = formatTokens(row.tokenTotal);
+  const repeatCosts =
+    repeatGroup?.trials
+      .map((trial) => trial.cost_usd)
+      .filter((value): value is number => typeof value === 'number' && Number.isFinite(value)) ??
+    [];
+  const repeatTokens =
+    repeatGroup?.trials
+      .map(caseTrialTokenTotal)
+      .filter((value): value is number => typeof value === 'number' && Number.isFinite(value)) ??
+    [];
+  const repeatCostTotal =
+    repeatCosts.length > 0 ? repeatCosts.reduce((sum, value) => sum + value, 0) : undefined;
+  const repeatTokenTotal =
+    repeatTokens.length > 0 ? repeatTokens.reduce((sum, value) => sum + value, 0) : undefined;
+  const cost = formatCost(row.result.costUsd ?? repeatCostTotal);
+  const tokens = formatTokens(row.tokenTotal ?? repeatTokenTotal);
   const breakdown = compactTokenBreakdown(row.result);
   if (!cost && !tokens) return <span className="text-gray-600">-</span>;
 
@@ -648,6 +1061,18 @@ function CostTokenCell({
   );
 }
 
+/**
+ * Cost/tokens cell of a trial row: the formatted cost above the formatted token total.
+ * Each line is omitted when its formatter returns nothing; when both are missing a dash is
+ * shown instead.
+ */
+function TrialCostTokenCell({ trial }: { trial: EvalCaseTrial }) {
+  const costText = formatCost(trial.cost_usd);
+  const tokenText = formatTokens(caseTrialTokenTotal(trial));
+  if (!(costText || tokenText)) {
+    return <span className="text-gray-700">-</span>;
+  }
+  return (
+    <div className="min-w-0 text-right">
+      {costText ? <div className="tabular-nums text-gray-500">{costText}</div> : null}
+      {tokenText ? <div className="text-xs tabular-nums text-gray-600">{tokenText}</div> : null}
+    </div>
+  );
+}
+
 function GraderScoreCell({
   score,
   passThreshold,
diff --git a/apps/dashboard/src/components/ResumeRunActions.tsx b/apps/dashboard/src/components/ResumeRunActions.tsx
index b92324446..3ef4f5e27 100644
--- a/apps/dashboard/src/components/ResumeRunActions.tsx
+++ b/apps/dashboard/src/components/ResumeRunActions.tsx
@@ -64,7 +64,7 @@ export function ResumeRunActions({
   const disabledReason = !runDir
     ? 'Run directory unavailable (remote run cannot be resumed in place)'
     : !suiteFilter
-      ? 'Original eval file path missing from benchmark.json — cannot determine what to resume'
+      ? 'Original eval file path missing from summary.json — cannot determine what to resume'
       : '';
 
   async function launch(mode: ResumeMode) {
diff --git a/apps/dashboard/src/components/resume-run-helpers.ts b/apps/dashboard/src/components/resume-run-helpers.ts
index 1736adf33..c8cd881fc 100644
--- a/apps/dashboard/src/components/resume-run-helpers.ts
+++ b/apps/dashboard/src/components/resume-run-helpers.ts
@@ -31,8 +31,8 @@ export interface BuildResumeRequestParams {
  * Case 2 covers Stop-button / Ctrl+C interruptions where the run produced
  * only successful rows before being killed: there is no `execution_error`
  * to anchor on, but the run is still resumable. `plannedTestCount` is
- * persisted in `benchmark.json.metadata` at run start (see
- * `writeInitialBenchmarkArtifact`).
+ * persisted in `summary.json.metadata` at run start (see
+ * `writeInitialRunSummaryArtifact`).
  *
  * Hidden in read-only mode — the server also returns 403, but UI-level
  * hiding avoids dead controls.
diff --git a/apps/dashboard/src/lib/api.ts b/apps/dashboard/src/lib/api.ts
index becd3c466..bdbc59c82 100644
--- a/apps/dashboard/src/lib/api.ts
+++ b/apps/dashboard/src/lib/api.ts
@@ -99,22 +99,33 @@ function encodeArtifactPath(filePath: string): string {
     .join('/');
 }
 
+/**
+ * Appends `params` to `base` as a query string.
+ *
+ * Returns `base` unchanged when `params` is empty. When `base` already contains a query
+ * string the parameters are joined with `&` instead of a second `?`, which would produce a
+ * malformed URL.
+ */
+function withQueryParams(base: string, params: URLSearchParams): string {
+  const query = params.toString();
+  if (!query) return base;
+  const separator = base.includes('?') ? '&' : '?';
+  return `${base}${separator}${query}`;
+}
+
+/**
+ * Query parameters that select an eval's artifact directory: `artifact_dir` is set when
+ * `artifactDir` is a non-empty string, otherwise the parameter set is empty.
+ */
+function evalArtifactParams(artifactDir?: string): URLSearchParams {
+  return new URLSearchParams(artifactDir ? { artifact_dir: artifactDir } : undefined);
+}
+
 export function artifactFileContentUrl(options: {
   runId: string;
   evalId: string;
   filePath: string;
   projectId?: string;
+  artifactDir?: string;
   raw?: boolean;
   download?: boolean;
 }): string {
   const base = options.projectId
     ? `${projectApiBase(options.projectId)}/runs/${encodeURIComponent(options.runId)}/evals/${encodeURIComponent(options.evalId)}/files/${encodeArtifactPath(options.filePath)}`
     : `/api/runs/${encodeURIComponent(options.runId)}/evals/${encodeURIComponent(options.evalId)}/files/${encodeArtifactPath(options.filePath)}`;
-  const params = new URLSearchParams();
+  const params = evalArtifactParams(options.artifactDir);
   if (options.raw) params.set('raw', '1');
   if (options.download) params.set('download', '1');
-  const query = params.toString();
-  return query ? `${base}?${query}` : base;
+  return withQueryParams(base, params);
 }
 
 export const runListOptions = queryOptions({
@@ -164,13 +175,12 @@ export function runSuitesOptions(runId: string) {
   });
 }
 
-export function evalDetailOptions(runId: string, evalId: string) {
+export function evalDetailOptions(runId: string, evalId: string, artifactDir?: string) {
+  const base = `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}`;
   return queryOptions({
-    queryKey: ['runs', runId, 'evals', evalId],
+    queryKey: ['runs', runId, 'evals', evalId, artifactDir ?? ''],
     queryFn: () =>
-      fetchJson<EvalDetailResponse>(
-        `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}`,
-      ),
+      fetchJson<EvalDetailResponse>(withQueryParams(base, evalArtifactParams(artifactDir))),
     enabled: !!runId && !!evalId,
   });
 }
@@ -212,33 +222,38 @@ export const targetsOptions = queryOptions({
   queryFn: () => fetchJson<TargetsResponse>('/api/targets'),
 });
 
-export function evalFilesOptions(runId: string, evalId: string) {
+export function evalFilesOptions(runId: string, evalId: string, artifactDir?: string) {
+  const base = `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/files`;
   return queryOptions({
-    queryKey: ['runs', runId, 'evals', evalId, 'files'],
+    queryKey: ['runs', runId, 'evals', evalId, artifactDir ?? '', 'files'],
     queryFn: () =>
-      fetchJson<FileTreeResponse>(
-        `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/files`,
-      ),
+      fetchJson<FileTreeResponse>(withQueryParams(base, evalArtifactParams(artifactDir))),
     enabled: !!runId && !!evalId,
   });
 }
 
-export function evalFileContentOptions(runId: string, evalId: string, filePath: string) {
+export function evalFileContentOptions(
+  runId: string,
+  evalId: string,
+  filePath: string,
+  artifactDir?: string,
+) {
   return queryOptions({
-    queryKey: ['runs', runId, 'evals', evalId, 'files', filePath],
+    queryKey: ['runs', runId, 'evals', evalId, artifactDir ?? '', 'files', filePath],
     queryFn: () =>
-      fetchJson<FileContentResponse>(artifactFileContentUrl({ runId, evalId, filePath })),
+      fetchJson<FileContentResponse>(
+        artifactFileContentUrl({ runId, evalId, filePath, artifactDir }),
+      ),
     enabled: !!runId && !!evalId && !!filePath,
   });
 }
 
-export function evalTranscriptOptions(runId: string, evalId: string) {
+export function evalTranscriptOptions(runId: string, evalId: string, artifactDir?: string) {
+  const base = `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`;
   return queryOptions({
-    queryKey: ['runs', runId, 'evals', evalId, 'transcript'],
+    queryKey: ['runs', runId, 'evals', evalId, artifactDir ?? '', 'transcript'],
     queryFn: () =>
-      fetchJson<TranscriptArtifactResponse>(
-        `/api/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`,
-      ),
+      fetchJson<TranscriptArtifactResponse>(withQueryParams(base, evalArtifactParams(artifactDir))),
     enabled: !!runId && !!evalId,
   });
 }
@@ -301,8 +316,8 @@ export function useRunSuites(runId: string) {
   return useQuery(runSuitesOptions(runId));
 }
 
-export function useEvalDetail(runId: string, evalId: string) {
-  return useQuery(evalDetailOptions(runId, evalId));
+export function useEvalDetail(runId: string, evalId: string, artifactDir?: string) {
+  return useQuery(evalDetailOptions(runId, evalId, artifactDir));
 }
 
 export function useIndex() {
@@ -325,16 +340,21 @@ export function useTargets() {
   return useQuery(targetsOptions);
 }
 
-export function useEvalFiles(runId: string, evalId: string) {
-  return useQuery(evalFilesOptions(runId, evalId));
+export function useEvalFiles(runId: string, evalId: string, artifactDir?: string) {
+  return useQuery(evalFilesOptions(runId, evalId, artifactDir));
 }
 
-export function useEvalFileContent(runId: string, evalId: string, filePath: string) {
-  return useQuery(evalFileContentOptions(runId, evalId, filePath));
+export function useEvalFileContent(
+  runId: string,
+  evalId: string,
+  filePath: string,
+  artifactDir?: string,
+) {
+  return useQuery(evalFileContentOptions(runId, evalId, filePath, artifactDir));
 }
 
-export function useEvalTranscript(runId: string, evalId: string) {
-  return useQuery(evalTranscriptOptions(runId, evalId));
+export function useEvalTranscript(runId: string, evalId: string, artifactDir?: string) {
+  return useQuery(evalTranscriptOptions(runId, evalId, artifactDir));
 }
 
 export function useRunCategories(runId: string) {
@@ -531,24 +551,32 @@ export function projectCategorySuitesOptions(projectId: string, runId: string, c
   });
 }
 
-export function projectEvalDetailOptions(projectId: string, runId: string, evalId: string) {
+export function projectEvalDetailOptions(
+  projectId: string,
+  runId: string,
+  evalId: string,
+  artifactDir?: string,
+) {
+  const base = `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}`;
   return queryOptions({
-    queryKey: ['projects', projectId, 'runs', runId, 'evals', evalId],
+    queryKey: ['projects', projectId, 'runs', runId, 'evals', evalId, artifactDir ?? ''],
     queryFn: () =>
-      fetchJson<EvalDetailResponse>(
-        `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}`,
-      ),
+      fetchJson<EvalDetailResponse>(withQueryParams(base, evalArtifactParams(artifactDir))),
     enabled: !!projectId && !!runId && !!evalId,
   });
 }
 
-export function projectEvalFilesOptions(projectId: string, runId: string, evalId: string) {
+export function projectEvalFilesOptions(
+  projectId: string,
+  runId: string,
+  evalId: string,
+  artifactDir?: string,
+) {
+  const base = `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/files`;
   return queryOptions({
-    queryKey: ['projects', projectId, 'runs', runId, 'evals', evalId, 'files'],
+    queryKey: ['projects', projectId, 'runs', runId, 'evals', evalId, artifactDir ?? '', 'files'],
     queryFn: () =>
-      fetchJson<FileTreeResponse>(
-        `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/files`,
-      ),
+      fetchJson<FileTreeResponse>(withQueryParams(base, evalArtifactParams(artifactDir))),
     enabled: !!projectId && !!runId && !!evalId,
   });
 }
@@ -558,24 +586,48 @@ export function projectEvalFileContentOptions(
   runId: string,
   evalId: string,
   filePath: string,
+  artifactDir?: string,
 ) {
   return queryOptions({
-    queryKey: ['projects', projectId, 'runs', runId, 'evals', evalId, 'files', filePath],
+    queryKey: [
+      'projects',
+      projectId,
+      'runs',
+      runId,
+      'evals',
+      evalId,
+      artifactDir ?? '',
+      'files',
+      filePath,
+    ],
     queryFn: () =>
       fetchJson<FileContentResponse>(
-        artifactFileContentUrl({ projectId, runId, evalId, filePath }),
+        artifactFileContentUrl({ projectId, runId, evalId, filePath, artifactDir }),
       ),
     enabled: !!projectId && !!runId && !!evalId && !!filePath,
   });
 }
 
-export function projectEvalTranscriptOptions(projectId: string, runId: string, evalId: string) {
+export function projectEvalTranscriptOptions(
+  projectId: string,
+  runId: string,
+  evalId: string,
+  artifactDir?: string,
+) {
+  const base = `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`;
   return queryOptions({
-    queryKey: ['projects', projectId, 'runs', runId, 'evals', evalId, 'transcript'],
+    queryKey: [
+      'projects',
+      projectId,
+      'runs',
+      runId,
+      'evals',
+      evalId,
+      artifactDir ?? '',
+      'transcript',
+    ],
     queryFn: () =>
-      fetchJson<TranscriptArtifactResponse>(
-        `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/evals/${encodeURIComponent(evalId)}/transcript`,
-      ),
+      fetchJson<TranscriptArtifactResponse>(withQueryParams(base, evalArtifactParams(artifactDir))),
     enabled: !!projectId && !!runId && !!evalId,
   });
 }
diff --git a/apps/dashboard/src/lib/result-table.test.ts b/apps/dashboard/src/lib/result-table.test.ts
index 78a50de91..6c398fecc 100644
--- a/apps/dashboard/src/lib/result-table.test.ts
+++ b/apps/dashboard/src/lib/result-table.test.ts
@@ -104,9 +104,9 @@ describe('result-table model', () => {
     expect(model.columns.map((column) => column.id)).toEqual([
       'status',
       'test',
-      'model_target',
-      'score',
+      'target',
       'suite',
+      'score',
       'category',
       'duration',
       'cost_tokens',
@@ -116,6 +116,38 @@ describe('result-table model', () => {
     expect(model.visibleColumns.map((column) => column.id)).toContain('grader:correctness');
   });
 
+  it('orders repeat-run columns with target before suite before score', () => {
+    const model = buildResultTableModel({
+      passThreshold: 0.8,
+      results: [
+        result({
+          testId: 'repeat-case',
+          suite: 'strict-layout',
+          target: 'openai',
+          trials: [
+            { attempt: 0, run_path: 'run-1', score: 1, verdict: 'pass' },
+            { attempt: 1, run_path: 'run-2', score: 0.4, verdict: 'fail' },
+          ],
+        }),
+      ],
+    });
+
+    expect(model.columns.map((column) => column.id).slice(0, 6)).toEqual([
+      'status',
+      'expander',
+      'test',
+      'target',
+      'suite',
+      'score',
+    ]);
+    expect(model.repeatGroups).toHaveLength(1);
+    expect(model.repeatGroups[0]).toMatchObject({
+      trialCount: 2,
+      passedTrials: 1,
+      failedTrials: 1,
+    });
+  });
+
   it('accepts legacy scorer URL state as a grader alias', () => {
     const model = buildResultTableModel({
       passThreshold: 0.8,
diff --git a/apps/dashboard/src/lib/result-table.ts b/apps/dashboard/src/lib/result-table.ts
index 626d41f44..6587b04f5 100644
--- a/apps/dashboard/src/lib/result-table.ts
+++ b/apps/dashboard/src/lib/result-table.ts
@@ -7,7 +7,7 @@
  */
 
 import { isExecutionError } from './result-summary';
-import type { EvalResult, ScoreEntry } from './types';
+import type { AssertionEntry, EvalCaseTrial, EvalResult, ScoreEntry } from './types';
 
 export type ResultTableViewId =
   | 'all'
@@ -76,9 +76,27 @@ export interface ResultTableRow {
   readonly searchText: string;
 }
 
+export interface RepeatRunGroup {
+  readonly row: ResultTableRow; // row whose result carries the trials
+  readonly trials: readonly EvalCaseTrial[]; // only trials with a run_path or verdict
+  readonly trialCount: number; // trials.length
+  readonly passedTrials: number; // trials accepted by caseTrialPassed
+  readonly failedTrials: number; // trialCount - passedTrials
+  readonly passRate: number; // passedTrials / trialCount
+  readonly meanScore: number; // mean of finite trial scores, else the row result's score
+  readonly assertionCount: number; // de-duplicated result, score and trial assertions
+  readonly passedAssertions: number; // subset of those assertions with passed set
+  readonly assertionPassRate?: number; // omitted when there are no assertions
+  readonly meanDurationMs?: number; // omitted when no trial has a finite duration_ms
+  readonly totalToolCalls?: number; // sum over trials; omitted when none report a count
+  readonly artifactCount: number; // artifact paths set across all trials
+}
+
 export interface ResultTableModel {
   readonly rows: readonly ResultTableRow[];
   readonly filteredRows: readonly ResultTableRow[];
+  readonly repeatGroups: readonly RepeatRunGroup[];
+  readonly filteredRepeatGroups: readonly RepeatRunGroup[];
   readonly columns: readonly ResultTableColumn[];
   readonly visibleColumns: readonly ResultTableColumn[];
   readonly state: ResultTableState;
@@ -133,6 +151,21 @@ function flattenScoreText(scores: readonly ScoreEntry[] | undefined): string[] {
   return parts.filter((part) => part.length > 0);
 }
 
+function scoreAssertions(scores: readonly ScoreEntry[] | undefined): AssertionEntry[] {
+  if (!scores || scores.length === 0) return []; // base case of the recursive flatten below
+  return scores.flatMap((score) => [...(score.assertions ?? []), ...scoreAssertions(score.scores)]);
+}
+
+function uniqueAssertions(assertions: readonly AssertionEntry[]): AssertionEntry[] {
+  const seen = new Set<string>(); // identity keys of the assertions kept so far
+  return assertions.filter((assertion) => {
+    const key = `${assertion.text}\0${assertion.evidence ?? ''}\0${assertion.passed}`;
+    if (seen.has(key)) return false; // same text, evidence and passed flag as an earlier entry
+    seen.add(key);
+    return true; // first occurrence is kept, so input order is preserved
+  });
+}
+
 function buildGraderMap(
   scores: readonly ScoreEntry[] | undefined,
 ): ReadonlyMap<string, ScoreEntry> {
@@ -156,6 +189,32 @@ function totalTokens(result: EvalResult): number | undefined {
   return values.reduce((sum, value) => sum + value, 0);
 }
 
+function numeric(values: readonly (number | undefined)[]): number[] { // keeps finite numbers only
+  return values.filter(
+    (value): value is number => typeof value === 'number' && Number.isFinite(value),
+  ); // undefined, NaN and Infinity are dropped
+}
+
+function caseTrials(result: EvalResult): readonly EvalCaseTrial[] { // trials, or [] when absent
+  return result.trials ?? [];
+}
+
+function caseTrialPassed(trial: EvalCaseTrial, passThreshold: number): boolean {
+  if (trial.verdict === 'pass') return true; // an explicit verdict is used before the score
+  if (trial.verdict === 'fail') return false;
+  return typeof trial.score === 'number' ? trial.score >= passThreshold : false; // no score: failed
+}
+
+function caseTrialArtifactCount(trial: EvalCaseTrial): number { // how many artifact paths are set
+  return [
+    trial.metrics_path,
+    trial.timing_path,
+    trial.grading_path,
+    trial.transcript_path,
+    trial.answer_path,
+  ].filter(Boolean).length; // undefined and empty-string paths are not counted
+}
+
 function modelLabel(result: EvalResult): string | undefined {
   const direct = cleanString(result.model);
   if (direct) return direct;
@@ -228,30 +287,77 @@ function buildRow(
   };
 }
 
-function hasMeaningfulTarget(rows: readonly ResultTableRow[]): boolean {
-  return rows.some((row) => row.targetLabel !== 'default' || row.modelLabel);
+function buildRepeatGroup(row: ResultTableRow, passThreshold: number): RepeatRunGroup | undefined {
+  const trials = caseTrials(row.result).filter((trial) => trial.run_path || trial.verdict);
+  if (trials.length <= 1) return undefined; // fewer than two usable trials: no repeat group
+
+  const passedTrials = trials.filter((trial) => caseTrialPassed(trial, passThreshold)).length;
+  const durationValues = numeric(trials.map((trial) => trial.duration_ms));
+  const scoreValues = numeric(trials.map((trial) => trial.score));
+  const toolCallValues = numeric(trials.map((trial) => trial.total_tool_calls));
+  const artifactCount = trials.reduce((sum, trial) => sum + caseTrialArtifactCount(trial), 0);
+  const assertions = uniqueAssertions([ // result-level, nested-score and per-trial, de-duplicated
+    ...(row.result.assertions ?? []),
+    ...scoreAssertions(row.result.scores),
+    ...trials.flatMap((trial) => [...(trial.assertions ?? []), ...scoreAssertions(trial.scores)]),
+  ]);
+  const passedAssertions = assertions.filter((assertion) => assertion.passed).length;
+
+  return {
+    row,
+    trials,
+    trialCount: trials.length,
+    passedTrials,
+    failedTrials: trials.length - passedTrials,
+    passRate: trials.length > 0 ? passedTrials / trials.length : 0, // length is >= 2 here
+    meanScore:
+      scoreValues.length > 0
+        ? scoreValues.reduce((sum, value) => sum + value, 0) / scoreValues.length
+        : row.result.score, // no finite trial score: fall back to the row result's score
+    assertionCount: assertions.length,
+    passedAssertions,
+    ...(assertions.length > 0 && { assertionPassRate: passedAssertions / assertions.length }),
+    ...(durationValues.length > 0 && {
+      meanDurationMs: durationValues.reduce((sum, value) => sum + value, 0) / durationValues.length,
+    }),
+    ...(toolCallValues.length > 0 && {
+      totalToolCalls: toolCallValues.reduce((sum, value) => sum + value, 0),
+    }), // the three optional keys above are left off the object when they have no data
+    artifactCount,
+  };
 }
 
 function buildColumns(rows: readonly ResultTableRow[], graderOptions: readonly string[]) {
+  const hasRepeatRows = rows.some((row) => caseTrials(row.result).length > 1);
   const hasSuite = rows.some((row) => row.suiteLabel);
   const hasCategory = rows.some((row) => row.categoryLabel);
-  const hasDuration = rows.some((row) => row.result.durationMs != null);
-  const hasCostOrTokens = rows.some((row) => row.result.costUsd != null || row.tokenTotal != null);
+  const hasDuration = rows.some(
+    (row) =>
+      row.result.durationMs != null ||
+      caseTrials(row.result).some((trial) => trial.duration_ms != null),
+  );
+  const hasCostOrTokens = rows.some(
+    (row) =>
+      row.result.costUsd != null ||
+      row.tokenTotal != null ||
+      caseTrials(row.result).some(
+        (trial) =>
+          trial.cost_usd != null || trial.total_tokens != null || trial.token_usage != null,
+      ),
+  );
   const hasError = rows.some((row) => row.result.error);
 
   const columns: ResultTableColumn[] = [
     { id: 'status', label: 'Status', kind: 'base', defaultVisible: true },
+    ...(hasRepeatRows
+      ? [{ id: 'expander', label: 'Expand', kind: 'base' as const, defaultVisible: true }]
+      : []),
     { id: 'test', label: 'Test ID', kind: 'base', defaultVisible: true },
-    {
-      id: 'model_target',
-      label: 'Model / Target',
-      kind: 'base',
-      defaultVisible: hasMeaningfulTarget(rows),
-    },
-    { id: 'score', label: 'Score', kind: 'base', defaultVisible: true },
+    { id: 'target', label: 'Target', kind: 'base', defaultVisible: true },
     ...(hasSuite
       ? [{ id: 'suite', label: 'Suite', kind: 'base' as const, defaultVisible: true }]
       : []),
+    { id: 'score', label: 'Score', kind: 'base', defaultVisible: true },
     ...(hasCategory
       ? [{ id: 'category', label: 'Category', kind: 'base' as const, defaultVisible: false }]
       : []),
@@ -295,6 +401,23 @@ function defaultVisibleColumnIds(columns: readonly ResultTableColumn[]): string[
   return defaults.length > 0 ? defaults : columns.slice(0, 4).map((column) => column.id);
 }
 
+function includeStructuralColumn( // adds 'expander' to a requested column list when needed
+  requestedColumns: readonly string[],
+  columns: readonly ResultTableColumn[],
+): string[] {
+  if (!columns.some((column) => column.id === 'expander')) return [...requestedColumns];
+  if (requestedColumns.includes('expander') || !requestedColumns.includes('test')) {
+    return [...requestedColumns]; // already requested, or no 'test' column requested
+  }
+
+  const next = [...requestedColumns]; // copy so the caller's array is not mutated
+  const statusIndex = next.indexOf('status');
+  const testIndex = next.indexOf('test');
+  const insertIndex = statusIndex >= 0 ? statusIndex + 1 : testIndex; // after status, else at test
+  next.splice(insertIndex, 0, 'expander');
+  return next;
+}
+
 function normalizeState(
   input: ResultTableStateInput | undefined,
   columns: readonly ResultTableColumn[],
@@ -307,7 +430,9 @@ function normalizeState(
       ?.map((id) => (id.startsWith('scorer:') ? `grader:${id.slice('scorer:'.length)}` : id))
       .filter((id) => columnIds.has(id)) ?? [];
   const visibleColumnIds =
-    requestedColumns.length > 0 ? requestedColumns : defaultVisibleColumnIds(columns);
+    requestedColumns.length > 0
+      ? includeStructuralColumn(requestedColumns, columns)
+      : defaultVisibleColumnIds(columns);
   const target =
     input?.target && targetOptions.includes(input.target) ? input.target : DEFAULT_TARGET;
   const requestedGrader = input?.grader ?? input?.scorer;
@@ -370,10 +495,17 @@ export function buildResultTableModel(input: BuildResultTableModelInput): Result
     if (query && !row.searchText.includes(query)) return false;
     return true;
   });
+  const repeatGroups = rows
+    .map((row) => buildRepeatGroup(row, input.passThreshold))
+    .filter((group): group is RepeatRunGroup => Boolean(group));
+  const filteredRowKeys = new Set(filteredRows.map((row) => row.key));
+  const filteredRepeatGroups = repeatGroups.filter((group) => filteredRowKeys.has(group.row.key));
 
   return {
     rows,
     filteredRows,
+    repeatGroups,
+    filteredRepeatGroups,
     columns,
     visibleColumns: columns.filter((column) => visibleColumnIds.has(column.id)),
     state,
diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts
index d86ca3c9f..e790c8558 100644
--- a/apps/dashboard/src/lib/types.ts
+++ b/apps/dashboard/src/lib/types.ts
@@ -101,6 +101,49 @@ export interface AssertionEntry {
   durationMs?: number;
 }
 
+export interface EvalCaseTrial { // one entry of EvalResult.trials; every field is optional
+  attempt?: number;
+  run_path?: string;
+  score?: number;
+  verdict?: string; // 'pass' and 'fail' are the values caseTrialPassed checks for
+  scores?: ScoreEntry[];
+  assertions?: AssertionEntry[];
+  error?: string;
+  execution_status?: string;
+  cost_usd?: number;
+  total_tokens?: number;
+  token_usage?: TokenUsage;
+  duration_ms?: number;
+  total_tool_calls?: number;
+  tool_calls?: Record<string, number>; // presumably tool name -> call count; confirm with producer
+  metrics_path?: string; // this and the four paths below are tallied by caseTrialArtifactCount
+  timing_path?: string;
+  grading_path?: string;
+  transcript_path?: string;
+  answer_path?: string;
+}
+
+export type EvalTrialAggregation = // aggregation shapes, tagged by `strategy`
+  | {
+      strategy: 'pass_at_k';
+      passed_attempts?: number;
+      total_attempts?: number;
+    }
+  | {
+      strategy: 'mean';
+      mean?: number;
+      min?: number;
+      max?: number;
+    }
+  | {
+      strategy: 'confidence_interval';
+      mean?: number;
+      ci95_lower?: number;
+      ci95_upper?: number;
+      stddev?: number;
+    }
+  | Record<string, unknown>; // NOTE(review): catch-all; narrowing on `strategy` still includes it
+
 export interface SourceOmittedContent {
   reason: string;
   message?: string;
@@ -204,6 +247,16 @@ export interface EvalResult {
   externalTrace?: CamelExternalTraceMetadata;
   metadata?: Record<string, unknown>;
   source_traceability?: SourceTraceability;
+  trials?: EvalCaseTrial[];
+  aggregation?: EvalTrialAggregation;
+  artifact_dir?: string;
+  summary_path?: string;
+  grading_path?: string;
+  timing_path?: string;
+  metrics_path?: string;
+  transcript_path?: string;
+  output_path?: string;
+  answer_path?: string;
 }
 
 export interface RunDetailResponse {
@@ -216,7 +269,7 @@ export interface RunDetailResponse {
   status?: 'starting' | 'running' | 'finished' | 'failed';
   /** Path to the run workspace directory (relative to cwd when inside, otherwise absolute). Local runs only. */
   run_dir?: string;
-  /** Eval file path the run was launched against, if recorded in benchmark.json. Local runs only. */
+  /** Eval file path the run was launched against, if recorded in summary.json. Local runs only. */
   suite_filter?: string;
   /** Total (test_id, target) executions originally planned for this run. Used to detect incomplete partial runs as resumable. Local runs only, populated when the run was launched after the planned-count metadata feature shipped. */
   planned_test_count?: number;
diff --git a/apps/dashboard/src/routes/evals/$runId.$evalId.tsx b/apps/dashboard/src/routes/evals/$runId.$evalId.tsx
index c801dbd3c..89246b754 100644
--- a/apps/dashboard/src/routes/evals/$runId.$evalId.tsx
+++ b/apps/dashboard/src/routes/evals/$runId.$evalId.tsx
@@ -19,6 +19,10 @@ export const Route = createFileRoute('/evals/$runId/$evalId')({
 
 function EvalDetailPage() {
   const { runId, evalId } = Route.useParams();
+  const artifactDir =
+    typeof window === 'undefined'
+      ? undefined
+      : (new URLSearchParams(window.location.search).get('artifact_dir') ?? undefined);
   const { data, isLoading, error } = useRunDetail(runId);
   const { data: config } = useStudioConfig();
   const [showRunEval, setShowRunEval] = useState(false);
@@ -41,7 +45,9 @@ function EvalDetailPage() {
     );
   }
 
-  const result = data?.results.find((r) => r.testId === evalId);
+  const result = data?.results.find(
+    (r) => r.testId === evalId && (!artifactDir || r.artifact_dir === artifactDir),
+  );
 
   if (!result) {
     return (
diff --git a/apps/dashboard/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx b/apps/dashboard/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx
index 5d62014a4..60128f9da 100644
--- a/apps/dashboard/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx
+++ b/apps/dashboard/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx
@@ -15,6 +15,10 @@ export const Route = createFileRoute('/projects/$projectId_/evals/$runId/$evalId
 
 function ProjectEvalDetailPage() {
   const { projectId, runId, evalId } = Route.useParams();
+  const artifactDir =
+    typeof window === 'undefined'
+      ? undefined
+      : (new URLSearchParams(window.location.search).get('artifact_dir') ?? undefined);
   const { data, isLoading, error } = useProjectRunDetail(projectId, runId);
   const { data: config } = useStudioConfig(projectId);
   const [showRunEval, setShowRunEval] = useState(false);
@@ -37,7 +41,9 @@ function ProjectEvalDetailPage() {
     );
   }
 
-  const result = data?.results.find((r) => r.testId === evalId);
+  const result = data?.results.find(
+    (r) => r.testId === evalId && (!artifactDir || r.artifact_dir === artifactDir),
+  );
 
   if (!result) {
     return (
diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
index 5e6f6a63b..4dd133d96 100644
--- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
@@ -81,22 +81,25 @@ provenance:
 
 ```text
 <run-dir>/index.jsonl
-<run-dir>/benchmark.json
+<run-dir>/summary.json
 <run-dir>/<suite>/<case-id>/summary.json
-<run-dir>/<suite>/<case-id>/grading.json
 <run-dir>/<suite>/<case-id>/run-1/result.json
+<run-dir>/<suite>/<case-id>/run-1/grading.json
+<run-dir>/<suite>/<case-id>/run-1/metrics.json
+<run-dir>/<suite>/<case-id>/run-1/timing.json
 <run-dir>/<suite>/<case-id>/run-1/transcript.json
 <run-dir>/<suite>/<case-id>/run-1/transcript-raw.jsonl
 <run-dir>/<suite>/<case-id>/run-1/outputs/answer.md
-<run-dir>/<suite>/<case-id>/run-1/grading.json
 ```
 
 The repeated case aggregate folder uses `summary.json` for run-count, pass-rate,
 fingerprint, and flattened snake_case timing fields such as
-`mean_duration_ms`, and `grading.json` for compact trial/aggregation verdicts.
+`mean_duration_ms`.
 Each `run-N/result.json` is the per-attempt manifest and includes
-`grading_path`, transcript/output paths, and embedded timing/o11y metrics.
-Root `index.jsonl` and root `benchmark.json` remain stable for existing CI
+`grading_path`, transcript/output paths, and embedded timing/observability metrics. Each
+attempt also keeps AgentV `grading.json`, `metrics.json`, and `timing.json`
+sidecars for detailed inspection.
+Root `index.jsonl` and root `summary.json` remain stable for existing CI
 summary scripts and uploaded artifact consumers.
 
 ## Targets and setup
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index 460a43f7b..166062f4d 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -80,7 +80,7 @@ Dry-run returns mock responses that don't match grader output schemas. Use it on
 
 ### Custom Output Directory
 
-Write all artifacts (index.jsonl, benchmark.json, per-test grading/timing) to a specific directory:
+Write all artifacts (index.jsonl, summary.json, per-test grading/timing) to a specific directory:
 
 ```bash
 agentv eval evals/my-eval.yaml --output ./my-results
@@ -110,12 +110,17 @@ Typical layout:
 ```text
 my-results/
   index.jsonl
-  benchmark.json
+  summary.json
   <test-id>/
-    grading.json
-    timing.json
-    task/PROMPT.md
-    outputs/answer.md
+    summary.json
+    run-1/
+      result.json
+      grading.json
+      metrics.json
+      timing.json
+      transcript.json
+      transcript-raw.jsonl
+      outputs/answer.md
     task/
       EVAL.yaml
       targets.yaml
@@ -133,9 +138,8 @@ tests while authoring an eval, but they are optional input organization rather
 than a separate artifact schema.
 
 If the source eval uses the `PROMPT.md` fallback instead of inline `input`,
-AgentV still writes the resolved task prompt to the generated
-`task/PROMPT.md`. This keeps the run artifact self-contained without requiring
-the same long prompt to be duplicated in both Markdown and YAML.
+AgentV records the generated task bundle metadata when source artifacts are
+available. It no longer emits a generated prompt sidecar for result rows.
 
 ### Manual or External-Agent Attempts
 
@@ -457,7 +461,7 @@ When automatic remote publishing sees pointers whose `ref` is
 `agentv/artifacts/v1` branch in the same results remote at
 `runs/<run-path>/<pointer.path>` and rewrites the published pointer `key` to
 that backend object key. The configured results branch is the metadata/control
-plane for `index.jsonl`, `benchmark.json`, tags, and pointers; it does not
+plane for `index.jsonl`, `summary.json`, tags, and pointers; it does not
 duplicate canonical trace/transcript payload bodies when those rows name
 `agentv/artifacts/v1`. Local pre-publish run workspaces can still contain the
 files beside the manifest, and Dashboard resolves the published pointers lazily
diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
index 157ac965b..0bea029db 100644
--- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
+++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
@@ -255,7 +255,7 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your
 | `evals.json` | `agentv eval evals.json` | Direct — no conversion needed |
 | `claude -p "prompt"` | `agentv eval evals.json --target claude` | Same eval, richer engine |
 | `grading.json` (read) | `<test-id>/grading.json` (write) | Same per-test schema, AgentV writes one grading file per test case |
-| `benchmark.json` (read) | `<output>/benchmark.json` (write) | AgentV writes the canonical run summary; convert it in a wrapper if another tool needs a narrower compatibility shape |
+| `benchmark.json` (read) | `<output>/summary.json` (write) | AgentV writes the canonical run summary; convert it in a wrapper if another tool needs a narrower compatibility shape |
 | n/a | `index.jsonl` (write) | AgentV-specific per-test manifest for filtering, retry, and replay workflows |
 | with-skill vs without-skill | `--target baseline --target candidate` | Structured comparison |
 | Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, code graders, etc. |
diff --git a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
index 3b4639677..83c59c047 100644
--- a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
+++ b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
@@ -107,11 +107,11 @@ The rest of the bundle follows the same pattern:
 
 ## Benchmark output
 
-Generate the run `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory:
+Generate the run `summary.json` alongside the standard result JSONL. The `summary.json` is automatically written to the artifact directory:
 
 ```bash
 agentv eval evals.json --target claude --output ./results
-# benchmark.json is written to ./results/benchmark.json
+# summary.json is written to ./results/summary.json
 ```
 
 The benchmark uses AgentV's pass threshold (score >= 0.8) for each target's `pass_rate`, plus timing and token summaries:
@@ -132,7 +132,7 @@ The benchmark uses AgentV's pass threshold (score >= 0.8) for each target's `pas
 }
 ```
 
-If another tool needs a different benchmark shape, keep `--output` as the source of truth and convert `<output>/benchmark.json` in a wrapper.
+If another tool needs a different benchmark shape, keep `--output` as the source of truth and convert `<output>/summary.json` in a wrapper.
 
 ## Converting to EVAL.yaml
 
diff --git a/apps/web/src/content/docs/docs/tools/dashboard.mdx b/apps/web/src/content/docs/docs/tools/dashboard.mdx
index 22da74d25..4f81462ab 100644
--- a/apps/web/src/content/docs/docs/tools/dashboard.mdx
+++ b/apps/web/src/content/docs/docs/tools/dashboard.mdx
@@ -295,7 +295,7 @@ projects:
         push_conflict_policy: block
 ```
 
-`results.repo.remote` is the Git remote URL AgentV fetches and pushes. `results.repo.path: .` stores completed run artifacts on a dedicated branch of the source repository without checking out that branch in the source worktree. AgentV manages the local Git remote alias for that URL, so the normal config stays portable across machines. When `results.repo.remote` is omitted, `results.repo.path` means an existing local Git checkout whose object database and refs AgentV should write to, and the branch defaults to `agentv/results/v1`. AgentV creates the branch automatically on first publish and commits only AgentV result paths into it. `sync.auto_push: false` keeps the result commit local; set it to `true` to push the branch best-effort after each completed run. `sync.require_push: true` is for CI workflows where a push failure should fail the command after local artifacts are written. `sync.push_conflict_policy` defaults to `block`; the removed `backup_and_force_push` value is rejected with migration guidance because AgentV never force-pushes result branches. Non-fast-forward result branch pushes are auto-merged with artifact-aware Git merge drivers and pushed as a fast-forward, so the canonical results branch is never force-pushed or rewritten. Genuine overlay conflicts route to a timestamped temp branch plus a GitHub compare link for a human merge instead.
+`results.repo.remote` is the Git remote URL used when AgentV creates a fresh results checkout, and the intended remote URL for portable project config. `results.repo.path: .` stores completed run artifacts on a dedicated branch of the source repository without checking out that branch in the source worktree. AgentV does not add or rewrite remotes inside an existing checkout; the checkout's existing `origin` must already point at the repository you want to fetch and push. When `results.repo.remote` is omitted, `results.repo.path` means an existing local Git checkout whose object database and refs AgentV should write to, and the branch defaults to `agentv/results/v1`. AgentV creates the branch automatically on first publish and commits only AgentV result paths into it. `sync.auto_push: false` keeps the result commit local; set it to `true` to push the branch best-effort after each completed run. `sync.require_push: true` is for CI workflows where a push failure should fail the command after local artifacts are written. `sync.push_conflict_policy` defaults to `block`; the removed `backup_and_force_push` value is rejected with migration guidance because AgentV never force-pushes result branches. Non-fast-forward result branch pushes are auto-merged with artifact-aware Git merge drivers and pushed as a fast-forward, so the canonical results branch is never force-pushed or rewritten. Genuine overlay conflicts route to a timestamped temp branch plus a GitHub compare link for a human merge instead.
 
 For a separate results repository, use `results.repo.remote` and an optional managed clone `results.repo.path`:
 
@@ -315,7 +315,7 @@ projects:
         push_conflict_policy: block
 ```
 
-`results.repo.remote` is the Git remote URL used for clone and push operations, so use HTTPS when credentials are HTTP-token based and SSH when the runtime has SSH keys configured. When `results.repo.remote` is set, `results.repo.path` is the filesystem location of the local clone AgentV manages for that remote results repo. Omit `results.repo.remote` only when `results.repo.path` points at an already-existing local checkout such as `.`.
+`results.repo.remote` is the Git remote URL used for clone and push operations, so use HTTPS when credentials are HTTP-token based and SSH when the runtime has SSH keys configured. When `results.repo.remote` is set and `results.repo.path` is missing or empty, AgentV creates that filesystem location with `git clone`. If `results.repo.path` already points at a Git checkout, AgentV treats that checkout's remotes as user-owned state: it fetches and pushes using the existing configured remote name (`origin` by default), but it does not run `git remote add` or `git remote set-url`. Omit `results.repo.remote` only when `results.repo.path` points at an already-existing local checkout such as `.`.
 
 You can also set a top-level global fallback in the same file. This is used when the current project is not registered or its registry entry has no `results` block:
 
diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx
index ac371163c..600a57791 100644
--- a/apps/web/src/content/docs/docs/tools/results.mdx
+++ b/apps/web/src/content/docs/docs/tools/results.mdx
@@ -112,25 +112,23 @@ Duplicate policy is explicit:
 
 ### Metrics sidecar
 
-Each direct per-case artifact directory includes `metrics.json`
-(`schema_version: "agentv.metrics.v1"`). This is an AgentV-owned
-derived projection over `trace.json`, the result row, and
-`grading.json`. It is the compact executor behavior summary for dashboards,
-comparison exports, and metric-style graders; it is not canonical trace storage
-and does not carry token/cost usage.
-
-Repeat-enabled cases use aggregate `summary.json` with flattened snake_case
-timing fields plus aggregate `grading.json`, then store attempt details under
-`run-N/`. Each `run-N/` contains a compact per-attempt manifest
-`result.json`, `transcript.json`, `transcript-raw.jsonl`, and `outputs/answer.md`,
-plus AgentV `grading.json`. The `result.json` file carries `grading_path`,
-transcript/output paths, and embedded timing/o11y metrics; repeat attempts do
-not write a separate `metrics.json` sidecar.
-
-`transcript.jsonl` remains the ordered conversational/log compatibility
-projection. Full trace detail stays in `trace.json`
-(`agentv.trace.v1`). `benchmark.json` remains the run-level aggregate summary,
-and `index.jsonl` carries the lightweight `metrics_path` plus
+Each attempt directory includes `metrics.json`
+(`schema_version: "agentv.metrics.v1"`). This is an AgentV-owned derived
+projection over the attempt trace/transcript, result row, and `grading.json`.
+It is the compact executor behavior summary for dashboards, comparison exports,
+and metric-style graders; it is not canonical trace storage and does not carry
+token/cost usage.
+
+Every case uses aggregate `summary.json`, then stores attempt details under
+`run-N/`. Each `run-N/` contains a compact per-attempt manifest `result.json`,
+`grading.json`, `metrics.json`, `timing.json`, `transcript.json`,
+`transcript-raw.jsonl`, and `outputs/answer.md`. The `result.json` file carries
+`grading_path`, transcript/output paths, and embedded timing/o11y metrics.
+
+`transcript-raw.jsonl` remains the ordered conversational/log compatibility
+projection. Full trace detail stays in `trace.json` (`agentv.trace.v1`) when
+emitted. `summary.json` remains the run-level aggregate summary, and
+`index.jsonl` carries lightweight explicit paths such as `metrics_path` plus
 the trace/transcript artifact pointers used for detached payload publishing.
 Duration, token, and cost usage remains in `timing.json`, including source
 labels such as `provider_reported`, `token_estimated`, `aggregate`, or
@@ -151,7 +149,7 @@ Vercel `@vercel/agent-eval` `results.o11y` maps into AgentV like this:
 | `shellCommands` | `metrics.shell_commands` | `metrics.json` |
 | `filesRead` | `metrics.files_read` | `metrics.json` |
 | `filesModified` | `metrics.files_modified` | `metrics.json` |
-| `toolCalls` | `metrics.tool_call_events`, `metrics.tool_calls`, and `metrics.tool_call_counts` | `metrics.json`; compact counts can also appear in `benchmark.json.run_summary[*].tool_calls` |
+| `toolCalls` | `metrics.tool_call_events`, `metrics.tool_calls`, and `metrics.tool_call_counts` | `metrics.json`; compact counts can also appear in `summary.json.run_summary[*].tool_calls` |
 | `totalToolCalls` | `metrics.total_tool_calls` | `metrics.json` |
 | `webFetches` | `metrics.web_fetches` | `metrics.json` |
 | `totalTurns` | `metrics.total_turns` | `metrics.json`; conversational rows remain in `transcript.jsonl` |
@@ -163,13 +161,13 @@ Agent Skills eval artifacts map into AgentV like this:
 | Agent Skills pattern | AgentV field | Artifact location |
 |----------------------|--------------|-------------------|
 | Authored `evals/evals.json` cases | AgentV eval cases and task bundle paths | Eval source plus optional `task_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` |
-| Per-case answer | Generated target output artifact | `outputs/answer.md` |
-| Per-case sidecars | Trace, transcript, metrics, and raw provider evidence | `trace.json`, `transcript.jsonl`, `metrics.json`, `provider.log` |
-| Per-case `timing.json` | Duration, token totals, cost, and usage source labels | `timing.json` |
-| Per-case `grading.json` | Assertions, graders, execution metrics, workspace changes | `grading.json`; summary fields can reference the same trace/result facts |
-| Iteration-level `benchmark.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `benchmark.json` |
+| Per-case answer | Generated target output artifact | `run-N/outputs/answer.md` |
+| Per-attempt sidecars | Trace, transcript, metrics, and raw provider evidence | `run-N/transcript.json`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json`, `provider.log` when present |
+| Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `run-N/timing.json` |
+| Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `run-N/grading.json`; summary fields can reference the same trace/result facts |
+| Iteration-level `benchmark.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` |
 | Transcript/log outlier analysis | Ordered transcript and canonical trace | `transcript.jsonl` for log compatibility; `trace.json` for full detail |
-| Aggregate pass rate/time/tokens/delta | Run summaries and comparison tooling | `benchmark.json`, result comparisons, and projection bundles |
+| Aggregate pass rate/time/tokens/delta | Run summaries and comparison tooling | `summary.json`, result comparisons, and projection bundles |
 
 ### Vendor-neutral projection bundle
 
@@ -239,7 +237,7 @@ The CLI contract is deliberately narrow: `agentv results` manages local result a
 
 Use these supported remote workflows instead:
 
-- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `repo.remote` with `repo.path: .` and `repo.branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo without requiring a machine-local Git remote name. AgentV reserves `agentv/results/v1` for primary results and `agentv/artifacts/v1` for heavy artifact payloads. When `index.jsonl` rows point trace or transcript payloads at `agentv/artifacts/v1`, automatic publishing stores those bytes on that artifact branch in the same remote and publishes pointer keys such as `runs/<run-path>/<pointer.path>`. The configured results branch remains the metadata/control plane (`index.jsonl`, `benchmark.json`, tags, and pointers) instead of duplicating canonical trace/transcript payload bodies. Local pre-publish run workspaces can still contain those files beside the manifest so local tools keep working. Mutable run tags are stored as `tags.json` with a `tag_revision`; there is no tag event log in the normal results layout. `results.repo.path` without `results.repo.remote` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. AgentV manages any local Git remote alias internally. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. Non-fast-forward result branch pushes never force-push: AgentV auto-merges concurrent remote writes with artifact-aware Git merge drivers (a union driver for the append-only `index.jsonl`, a JSON-union driver for tag and feedback overlays) and pushes the merge as a fast-forward, and routes a genuine overlay conflict to a timestamped `agentv/results-sync/...` branch plus a GitHub compare/PR link for a human merge. 
The removed `sync.push_conflict_policy: backup_and_force_push` value is rejected with migration guidance; remove the field or set it to `block`. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled.
+- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `repo.remote` with `repo.path: .` and `repo.branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo. AgentV never adds or rewrites remotes in an existing checkout; that checkout's `origin` must already point at the repository you want to fetch from and push to. AgentV reserves `agentv/results/v1` for primary results and `agentv/artifacts/v1` for heavy artifact payloads. When `index.jsonl` rows point trace or transcript payloads at `agentv/artifacts/v1`, automatic publishing stores those bytes on that artifact branch in the same remote and publishes pointer keys such as `runs/<run-path>/<pointer.path>`. The configured results branch remains the metadata/control plane (`index.jsonl`, `summary.json`, tags, and pointers) instead of duplicating canonical trace/transcript payload bodies. Local pre-publish run workspaces can still contain those files beside the manifest so local tools keep working. Mutable run tags are stored as `tags.json` with a `tag_revision`; there is no tag event log in the normal results layout. `results.repo.path` without `results.repo.remote` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `sync.auto_push: true` to push after publish, or `sync.require_push: true` in CI to fail when that push fails. Non-fast-forward result branch pushes never force-push: AgentV auto-merges concurrent remote writes with artifact-aware Git merge drivers (a union driver for the append-only `index.jsonl`, a JSON-union driver for tag and feedback overlays) and pushes the merge as a fast-forward, and routes a genuine overlay conflict to a timestamped `agentv/results-sync/...` branch plus a GitHub compare/PR link for a human merge. 
The removed `sync.push_conflict_policy: backup_and_force_push` value is rejected with migration guidance; remove the field or set it to `block`. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled.
 - **Manual Dashboard sync:** run `agentv dashboard`, open the project, and use **Sync Project**.
 - **Manual API sync:** while Dashboard is running, call `GET /api/projects/:projectId/remote/status` or `POST /api/projects/:projectId/remote/sync` for project-scoped automation. Single-project sessions also expose `GET /api/remote/status` and `POST /api/remote/sync`.
 - **Git escape hatch:** for advanced recovery, inspect or repair the configured `projects[].results.repo.path` clone with `git` directly, then sync again.
diff --git a/apps/web/src/content/docs/docs/tools/wip-checkpoints.mdx b/apps/web/src/content/docs/docs/tools/wip-checkpoints.mdx
index a6d2db05d..2eb389fa9 100644
--- a/apps/web/src/content/docs/docs/tools/wip-checkpoints.mdx
+++ b/apps/web/src/content/docs/docs/tools/wip-checkpoints.mdx
@@ -22,7 +22,7 @@ If no results repo is configured, or auto-push is disabled, `agentv eval` still
 
 | Location | Path or ref | What it contains |
 | --- | --- | --- |
-| Local project | `.agentv/results/<experiment>/<run-id>/benchmark.json` | A run-start stub with `metadata.planned_test_count` and the eval file path when known. This lets Dashboard recognize incomplete local runs as resumable. |
+| Local project | `.agentv/results/<experiment>/<run-id>/summary.json` | A run-start stub with `metadata.planned_test_count` and the eval file path when known. This lets Dashboard recognize incomplete local runs as resumable. |
 | Local project | `.agentv/results/<experiment>/<run-id>/index.jsonl` | Result rows appended as test cases finish. Rows use the normal snake_case result JSONL format. |
 | Results repo remote | `agentv/wip/<hostname>/<run-dir-basename>` | A forced-updated branch containing the checkpointed run under `.agentv/results/<same-relative-run-path>/`. |
 | Results repo storage branch | Configured `results.repo.branch`; local checkout configs default to `agentv/results/v1` | The final published run after `agentv eval` completes and the normal auto-export succeeds. |
@@ -31,7 +31,7 @@ The WIP branch name is derived from the current host and the run directory basen
 
 ## Lifecycle
 
-1. **Run start** — AgentV creates the local run directory and writes the initial `benchmark.json` stub. If auto-push is enabled, it creates a temporary git worktree for a branch named `agentv/wip/<hostname>/<run-dir-basename>`, based on the configured results storage branch. Missing storage branches are initialized automatically.
+1. **Run start** — AgentV creates the local run directory and writes the initial `summary.json` stub. If auto-push is enabled, it creates a temporary git worktree for a branch named `agentv/wip/<hostname>/<run-dir-basename>`, based on the configured results storage branch. Missing storage branches are initialized automatically.
 2. **While running** — about every 30 seconds, AgentV copies the current run directory into the WIP worktree, amends a single checkpoint commit, and force-pushes the WIP branch. If nothing changed, it skips the push.
 3. **Successful completion** — AgentV publishes the completed run to the normal results branch. After that publish is confirmed as `published` or `already_published`, it deletes the remote WIP branch.
 4. **Failure, interrupt, or final export failure** — AgentV stops the checkpoint loop and removes the temporary local worktree, but leaves the remote WIP branch intact for recovery.
@@ -55,7 +55,7 @@ git branch -r --list 'origin/agentv/wip/*'
 git switch --detach origin/agentv/wip/<hostname>/<run-dir-basename>
 
 # 4. Inspect the checkpointed run path.
-find .agentv/results -name benchmark.json
+find .agentv/results -name summary.json
 
 # 5. Copy the run tree into the eval project, preserving experiment paths.
 PROJECT=/path/to/eval-project
@@ -67,7 +67,7 @@ cd "$PROJECT"
 agentv eval <eval-file> --output .agentv/results/<experiment>/<run-id> --resume
 ```
 
-If the recovered `benchmark.json` contains `metadata.eval_file`, use that as `<eval-file>`.
+If the recovered `summary.json` contains `metadata.eval_file`, use that as `<eval-file>`.
 
 After the resumed run publishes successfully, AgentV cleans up any WIP branch it creates for the resumed run. Delete the original orphaned branch manually when you no longer need it:
 
@@ -77,13 +77,13 @@ git push origin --delete agentv/wip/<hostname>/<run-dir-basename>
 
 ## Dashboard and `results` surfaces
 
-- **Dashboard local runs:** an interrupted local run can show the one-click **Resume run** and **Rerun failed** actions when `benchmark.json` has `metadata.planned_test_count` greater than the number of result rows, or when any row has `execution_status: execution_error`.
+- **Dashboard local runs:** an interrupted local run can show the one-click **Resume run** and **Rerun failed** actions when `summary.json` has `metadata.planned_test_count` greater than the number of result rows, or when any row has `execution_status: execution_error`.
 - **Dashboard remote runs:** normal remote listing reads the configured results storage branch. It does not list `agentv/wip/...` WIP branches. Recover the checkpoint into the project-local run directory first, or wait for the final publish branch to receive a completed run.
 - **`agentv results` CLI:** the command family manages local run workspaces and reports. It does not have a WIP branch subcommand; use git for remote checkpoint inspection and cleanup.
 
 ## Operational caveats
 
-- The first remote checkpoint happens on the periodic interval, so a process that dies immediately after startup may only have the local `benchmark.json` stub.
+- The first remote checkpoint happens on the periodic interval, so a process that dies immediately after startup may only have the local `summary.json` stub.
 - The WIP branch is force-pushed and keeps one snapshot commit. Do not treat it as an audit log.
 - Checkpoint contents can include prompts, outputs, grader evidence, traces, and generated task bundles. Protect the results repo like any other eval artifact store.
 - Authentication and branch permissions are the same as normal results auto-push. If git or GitHub authentication is missing, AgentV warns and keeps evaluating locally.
diff --git a/docs/adr/2026-06-18-opik-post-run-export-boundary.md b/docs/adr/2026-06-18-opik-post-run-export-boundary.md
index e098f925c..263c0cd0f 100644
--- a/docs/adr/2026-06-18-opik-post-run-export-boundary.md
+++ b/docs/adr/2026-06-18-opik-post-run-export-boundary.md
@@ -86,7 +86,7 @@ The future Opik adapter should consume one of these equivalent inputs:
 1. `EvaluationResult[]` loaded from `index.jsonl` via `parseJsonlResults()`
 2. the completed run workspace with:
    - `index.jsonl`
-   - `benchmark.json`
+   - `summary.json`
    - per-test `grading.json`
    - per-test `timing.json`
    - per-test `outputs/trace.json`
diff --git a/docs/brainstorms/2026-06-05-sqlite-results-index-research.md b/docs/brainstorms/2026-06-05-sqlite-results-index-research.md
index 67a7cd7a6..b12627f05 100644
--- a/docs/brainstorms/2026-06-05-sqlite-results-index-research.md
+++ b/docs/brainstorms/2026-06-05-sqlite-results-index-research.md
@@ -16,7 +16,7 @@ The main benefit of SQLite is not just faster run-list rendering. It is high-vol
 
 - GitHub issue `#1259` is the main prior scaling issue. It identified `/api/runs` polling as O(N runs x manifest reads) and originally proposed an append-only run index.
 - PR `#1260` implemented append-only `index/runs.jsonl`, then was closed/rejected because it introduced drift, migration, growth, and commit-SHA amend complexity.
-- PR `#1261` merged the current git-native approach: `git ls-tree` plus `git cat-file --batch` over committed `benchmark.json` files, cursor pagination, lazy run materialization, and an `Agentv-Run:` commit trailer.
+- PR `#1261` merged the current git-native approach: `git ls-tree` plus `git cat-file --batch` over committed `summary.json` files, cursor pagination, lazy run materialization, and an `Agentv-Run:` commit trailer.
 - PRs `#994`, `#1258`, `#1296`, and `#1297` cover remote result sync and per-project result repo configuration.
 - PR `#741` moved canonical result consumers to run workspaces with `index.jsonl`; PR `#940` removed legacy flat manifest loading from canonical flows.
 - PR `#1040` added mutable local `tags.json` sidecars for per-run comparison; remote runs remain read-only.
@@ -26,7 +26,7 @@ The main benefit of SQLite is not just faster run-list rendering. It is high-vol
 
 ## Current Code Grounding
 
-- Remote result repo listing is in `packages/core/src/evaluation/results-repo.ts`. `listGitRuns()` reads remote `benchmark.json` blobs from `origin/main`.
+- Remote result repo listing is in `packages/core/src/evaluation/results-repo.ts`. `listGitRuns()` reads remote `summary.json` blobs from `origin/main`.
 - Dashboard result merge/list logic is in `apps/cli/src/commands/results/remote.ts`. It merges local run scans with remote git-native listing and uses a 60 second in-memory TTL cache.
 - Dashboard handlers in `apps/cli/src/commands/results/serve.ts` still enrich many views by loading `index.jsonl` per run after listing. This affects run lists, experiments, targets, compare, and analytics.
 - Local run discovery still uses `listResultFilesFromRunsDir()` in `apps/cli/src/commands/inspect/utils.ts`, which recursively scans `.agentv/results/runs/`.
@@ -46,7 +46,7 @@ This should be avoided. A committed SQLite database is binary, hard to review, p
 
 ### 3. Git-Native Only, Further Optimized
 
-Keep SQLite out for now and extend the current git-native path to batch-read remote `index.jsonl` blobs in addition to `benchmark.json`. This may be enough if profiling shows `git cat-file --batch` remains fast at the target scale.
+Keep SQLite out for now and extend the current git-native path to batch-read remote `index.jsonl` blobs in addition to `summary.json`. This may be enough if profiling shows `git cat-file --batch` remains fast at the target scale.
 
 ### 4. Append-Only JSONL Index
 
@@ -57,14 +57,14 @@ Already tried in PR `#1260` and rejected.
 Build SQLite as a local, disposable projection:
 
 - `results_index_meta(key, value)`
-- `runs(project_id, source, run_id, manifest_path, benchmark_path, ref, benchmark_blob_sha, manifest_blob_sha, experiment, target, timestamp, test_count, pass_rate, avg_score, size_bytes, updated_at)`
+- `runs(project_id, source, run_id, manifest_path, summary_path, ref, summary_blob_sha, manifest_blob_sha, experiment, target, timestamp, test_count, pass_rate, avg_score, size_bytes, updated_at)`
 - `run_tests(project_id, run_id, test_id, suite, category, target, score, execution_status, duration_ms, cost_usd, token_usage_json, scores_json)`
 - Optional later: `run_scores(project_id, run_id, test_id, grader_name, grader_type, score, verdict, duration_ms)` if drift needs grader-level breakdowns instead of only top-level test scores.
 
 Sync behavior:
 
 1. On Dashboard startup and `POST /api/remote/sync`, fetch remote results repos.
-2. Use `git ls-tree` with blob SHAs to detect changed remote `benchmark.json` and `index.jsonl` files.
+2. Use `git ls-tree` with blob SHAs to detect changed remote `summary.json` and `index.jsonl` files.
 3. Batch-read only changed blobs with `git cat-file --batch`.
 4. Upsert run and test summary rows.
 5. For local runs, scan `.agentv/results/runs/`, fingerprint `index.jsonl` by mtime/size or hash, and upsert changed rows.
diff --git a/docs/brainstorms/2026-06-08-eval-result-traceability-requirements.md b/docs/brainstorms/2026-06-08-eval-result-traceability-requirements.md
index 1220263c5..cb676197c 100644
--- a/docs/brainstorms/2026-06-08-eval-result-traceability-requirements.md
+++ b/docs/brainstorms/2026-06-08-eval-result-traceability-requirements.md
@@ -15,7 +15,7 @@ AgentV Dashboard result detail should make an eval result traceable back to the
 
 The current result artifacts explain what happened in a run, but not enough about where the definition came from. A Dashboard user can see output, score, assertions, and artifact files, yet must manually leave the run, find the source repository, locate the eval YAML, identify the right `test_id`, and resolve any file-backed inputs or grader prompts.
 
-WTG.AI.Prompts PR #679 exposed the gap. The final eval run for `evals/cargowise/database/data-transformation-pr50857-e2e.eval.yaml` passed, and the run artifact includes `test_id`, grader scores, `input.md`, `response.md`, `grading.json`, and `benchmark.json.metadata.eval_file`. It does not provide a compact Dashboard path from a result row to the source YAML test block, the structured `type: file` snippets, or the grader definitions that produced the score.
+WTG.AI.Prompts PR #679 exposed the gap. The final eval run for `evals/cargowise/database/data-transformation-pr50857-e2e.eval.yaml` passed, and the run artifact includes `test_id`, grader scores, `input.md`, `response.md`, `grading.json`, and `summary.json.metadata.eval_file`. It does not provide a compact Dashboard path from a result row to the source YAML test block, the structured `type: file` snippets, or the grader definitions that produced the score.
 
 The goal is not a full provenance system. AgentV should keep the core local-first and git-friendly: capture the eval-source facts available at run time, persist them in reviewable artifacts, and let Dashboard render them.
 
@@ -54,7 +54,7 @@ The goal is not a full provenance system. AgentV should keep the core local-firs
 
 **Compatibility And Safety**
 
-- R14. Existing `index.jsonl`, `benchmark.json`, `grading.json`, `input.md`, `response.md`, and `transcript.jsonl` consumers must keep working unchanged.
+- R14. Existing `index.jsonl`, `summary.json`, `grading.json`, `input.md`, `response.md`, and `transcript.jsonl` consumers must keep working unchanged.
 - R15. New artifact fields must be optional and backward compatible for historical runs.
 - R16. The artifact must avoid capturing environment variables, provider credentials, or workspace-local machine secrets.
 - R17. Large referenced files should be bounded by size limits with explicit truncation metadata; v1 may skip oversized content if it records path, hash when available, size, and reason.
diff --git a/docs/plans/1222-stop-run.md b/docs/plans/1222-stop-run.md
index 3c92ac232..bd470bf4e 100644
--- a/docs/plans/1222-stop-run.md
+++ b/docs/plans/1222-stop-run.md
@@ -20,7 +20,7 @@ AbortSignal threading.
    (not red). On click, optimistic local "Stopping…" label until the
    next status poll flips to terminal.
 4. **Resume detection for partial runs** — Persist `planned_test_count` in
-   `benchmark.json.metadata` at run start (early write), updated at end.
+   `summary.json.metadata` at run start (early write), updated at end.
    Run-detail API surfaces the number; Studio computes
    `shouldShowResumeActions(results, isReadOnly, plannedTestCount?)` as
    `executionError OR results.length < plannedTestCount`. No
@@ -63,9 +63,9 @@ AbortSignal threading.
 
 ### Resume — planned_test_count
 - `apps/cli/src/commands/eval/artifact-writer.ts`
-  - Extend `BenchmarkArtifact.metadata` with optional
+  - Extend `RunSummaryArtifact.metadata` with optional
     `planned_test_count?: number`.
-  - Add `writeInitialBenchmarkArtifact(runDir, { evalFile, targets,
+  - Add `writeInitialRunSummaryArtifact(runDir, { evalFile, targets,
     plannedTestCount, experiment })` that writes a stub at run start
     (`run_summary: {}`, `metadata` pre-filled).
 - `apps/cli/src/commands/eval/run-eval.ts` — call the initial writer
@@ -116,7 +116,7 @@ AbortSignal threading.
 
 ## Out-of-scope cleanups noted
 
-The benchmark.json write happens entirely at end-of-run today. Writing a
+The `summary.json` write happens entirely at end-of-run today. Writing a
 stub at start means a run that crashes before the first test still has a
 metadata file on disk — that may obsolete some of the fallback logic in
 `deriveResumeMeta`. We are not consolidating that here; this PR only adds
diff --git a/docs/plans/2026-06-09-eval-output-surface.md b/docs/plans/2026-06-09-eval-output-surface.md
index 99f3ae805..b2f10e0ca 100644
--- a/docs/plans/2026-06-09-eval-output-surface.md
+++ b/docs/plans/2026-06-09-eval-output-surface.md
@@ -7,13 +7,13 @@ Bead: `av-eval-output-config-surface-4e2`
 
 The eval run command currently exposes several overlapping ways to choose where results go:
 
-- `--output <dir>` / `-o <dir>` is the canonical run artifact directory. It writes `index.jsonl`, `benchmark.json`, `timing.json`, run source metadata, and per-test artifacts under that directory.
+- `--output <dir>` / `-o <dir>` is the canonical run artifact directory. It writes `index.jsonl`, `summary.json`, `timing.json`, run source metadata, and per-test artifacts under that directory.
 - `agentv.config.ts` `output.dir` exists, but current CLI normalization routes it through the legacy `outPath` branch, so it behaves like a file path rather than the documented output directory.
 - `agentv.config.ts` `output.format` is accepted by `defineConfig()` but eval runs ignore it.
 - `--out <path>` is deprecated and currently treated as a file path whose dirname becomes the artifact directory.
 - `--artifacts <dir>` is deprecated and currently aliases the artifact directory.
 - `--output-format` is deprecated and ignored because run directories always use `index.jsonl`.
-- `--benchmark-json` was a deprecated extra Agent Skills compatibility output path outside this cleanup's requested removal set; a follow-up cleanup removes that flag and keeps the run directory `benchmark.json` as canonical.
+- `--benchmark-json` was a deprecated extra Agent Skills compatibility output path outside this cleanup's requested removal set; a follow-up cleanup removes that flag and keeps the run directory `summary.json` as canonical.
 - Dashboard launch paths already pass `--output <dir>` and expect `<dir>/index.jsonl`.
 - Repository docs/examples still contain old `agentv eval --out <file>` guidance in compare workflows, grader-score helper comments, and local scripts.
 
@@ -49,7 +49,7 @@ Removed now:
 
 Warned/scheduled:
 
-- `--benchmark-json` is removed by the follow-up cleanup after auditing for consumers; use `--output <dir>` and read `<dir>/benchmark.json` instead of requesting a second benchmark file.
+- `--benchmark-json` is removed by the follow-up cleanup after auditing for consumers; use `--output <dir>` and read `<dir>/summary.json` instead of requesting a second benchmark file.
 
 ## Migration
 
diff --git a/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md
index 8c11522af..84e07b1f2 100644
--- a/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md
+++ b/docs/plans/2026-06-21-001-feat-av-quf-results-storage-plan.md
@@ -16,7 +16,7 @@ one artifact sidecar namespace, retention and compaction rules, a compact
 publication export, an append-only mutable-operation log, and an S3-compatible
 object-storage tier.
 
-The canonical AgentV run artifacts stay `benchmark.json`, `index.jsonl`, per-test
+The canonical AgentV run artifacts stay `summary.json`, `index.jsonl`, per-test
 grading/timing files, `outputs/trace.json`, and derived transcript artifacts.
 GitHub and Backblaze B2 are storage/publication targets over those artifacts.
 Dashboard and Hugging Face are viewers or publication surfaces. Phoenix is only
@@ -29,7 +29,7 @@ emitted spans; it is not an AgentV artifact projection or storage backend.
 
 `packages/core/src/evaluation/results-repo.ts` already implements the first git-native
 slice: `agentv/results/v1` is the default results branch, `runs/**` is listed with
-`git ls-tree`, `benchmark.json` blobs are read with `git cat-file --batch`, and the
+`git ls-tree`, `summary.json` blobs are read with `git cat-file --batch`, and the
 branch root is a deterministic orphan genesis. Current mutable tags live under
 `metadata/runs/**`, and heavy transcript sidecars are still written inside each run
 workspace by `packages/core/src/evaluation/run-artifacts.ts`.
@@ -49,7 +49,7 @@ without creating another hosted results platform inside AgentV.
 - Pin the git-native ref and path layout for `agentv/results/v1`,
   `agentv/artifacts/v1`, and `agentv/oplog/v1`.
 - Define retention, compaction, and migration rules for run metadata and heavy artifacts.
-- Define compact publication export as a derived artifact over `benchmark.json` and
+- Define compact publication export as a derived artifact over `summary.json` and
   `index.jsonl`, with no required `eval.txt`.
 - Define the mutable operation log and add-wins tag set semantics.
 - Define the Backblaze B2 S3-compatible object tier and secret-loading boundary.
@@ -106,7 +106,7 @@ without creating another hosted results platform inside AgentV.
   artifact payloads without deleting index metadata prematurely.
 - R12. Transcript migration must support transcripts under
   `agentv/artifacts/v1` while preserving existing logical artifact references.
-- R13. Publication export must be compact and derived from `benchmark.json` plus
+- R13. Publication export must be compact and derived from `summary.json` plus
   `index.jsonl`; it must not require an authored or generated `eval.txt`.
 
 ### Mutable Operations
@@ -129,14 +129,14 @@ without creating another hosted results platform inside AgentV.
 ## Key Technical Decisions
 
 - KTD1. Backend mode is a storage concern, not a product model. Use `git-native`,
-  `hybrid`, and `blob-native` as storage modes while keeping `benchmark.json` and
+  `hybrid`, and `blob-native` as storage modes while keeping `summary.json` and
   `index.jsonl` as the artifact contract that readers consume.
 - KTD2. Do not overload the existing `results.mode: github` field. Add
   `results.storage_mode` with values `git-native`, `hybrid`, and `blob-native`, and
   normalize missing `storage_mode` to `git-native`. Put object-store settings under
   `results.object_store`.
 - KTD3. The git tree remains the index for git-backed modes. `listGitRuns()` should
-  continue to list `runs/**/benchmark.json` from `agentv/results/v1`; no separate
+  continue to list `runs/**/summary.json` from `agentv/results/v1`; no separate
   branch-local `index/runs.jsonl` is introduced.
 - KTD4. Use one artifact sidecar namespace named `artifacts`. Do not introduce
   `artifact-blobs`, `blobs`, or per-artifact refs. Prefix by artifact class, for example
@@ -187,7 +187,7 @@ flowchart TB
 
 | Mode | Canonical index/listing | Artifact payloads | Mutable ops | Git dependency |
 | --- | --- | --- | --- | --- |
-| `git-native` | `git ls-tree -r agentv/results/v1 -- runs/` plus `git cat-file --batch` for `benchmark.json` | `agentv/artifacts/v1` stores payload bytes | `agentv/oplog/v1` | Required |
+| `git-native` | `git ls-tree -r agentv/results/v1 -- runs/` plus `git cat-file --batch` for `summary.json` | `agentv/artifacts/v1` stores payload bytes | `agentv/oplog/v1` | Required |
 | `hybrid` | Same primary git ref as `git-native` | Object storage stores selected payload bytes; git stores locators under the artifact namespace | `agentv/oplog/v1` | Required for index/oplog |
 | `blob-native` | Bucket manifest under the results namespace, with `ListObjectsV2` fallback by prefix | Object storage stores all payloads | Bucket oplog prefix | None |
 
@@ -195,7 +195,7 @@ flowchart TB
 
 ```text
 agentv/results/v1
-  runs/<experiment>/<timestamp>/benchmark.json
+  runs/<experiment>/<timestamp>/summary.json
   runs/<experiment>/<timestamp>/index.jsonl
   runs/<experiment>/<timestamp>/<test-artifacts except moved heavy payloads>
   metadata/runs/<experiment>/<timestamp>/materialized-tags.json
@@ -254,15 +254,15 @@ rewriting all results code at once.
 
 **Per-mode listing/index strategy:**
 
-- `git-native`: list `runs/**/benchmark.json` with `git ls-tree`; batch-read
+- `git-native`: list `runs/**/summary.json` with `git ls-tree`; batch-read
   benchmark blobs with `git cat-file --batch`; materialize run details lazily with
   `materializeGitRun()`.
-- `hybrid`: list from the same git ref and read the same `benchmark.json` blobs.
+- `hybrid`: list from the same git ref and read the same `summary.json` blobs.
   Artifact locators in `index.jsonl` or sidecar manifests decide whether bytes come
   from git artifacts or object storage.
 - `blob-native`: read a compact run manifest from bucket storage first. If the
   manifest is missing or stale, fall back to `ListObjectsV2` over
-  `runs/**/benchmark.json`-equivalent objects, rebuild the manifest, and continue.
+  `runs/**/summary.json`-equivalent objects, rebuild the manifest, and continue.
   Use continuation tokens because S3 listing returns a bounded page per request.
 
 **Test plan:**
@@ -274,7 +274,7 @@ rewriting all results code at once.
   - Proves the adapter interface can list runs in all modes from fixtures.
 - `packages/core/test/evaluation/results-repo.test.ts`
   - Existing git-native tests must keep passing.
-  - Add coverage that `git-native` listing remains one `runs/**/benchmark.json`
+  - Add coverage that `git-native` listing remains one `runs/**/summary.json`
     tree scan, not a committed index file.
 - `apps/cli/test/commands/results/serve.test.ts`
   - Dashboard `/api/runs` response shape stays stable across adapter-backed sources.
@@ -319,7 +319,7 @@ ref. Do not add windowed or per-run branches. Do not shard paths before measurem
 
 - Primary ref `agentv/results/v1`:
   - Owns `runs/**` and lightweight materialized metadata.
-  - Lists runs only through `runs/**/benchmark.json`.
+  - Lists runs only through `runs/**/summary.json`.
 - Artifact ref `agentv/artifacts/v1`:
   - Owns payload classes under `transcripts/`, `raw-logs/`, and `screenshots/`.
   - May store payload bytes in `git-native`.
@@ -452,7 +452,7 @@ for rerun, comparison, grading, or adapter ingestion.
   - `apps/cli/src/commands/results/publication.ts`
   - `packages/core/src/evaluation/results-publication.ts`
 - `packages/core/src/evaluation/run-artifacts.ts`
-  - Remains the source for `benchmark.json`, `index.jsonl`, and per-test artifact
+  - Remains the source for `summary.json`, `index.jsonl`, and per-test artifact
     schemas.
 - `apps/web/src/content/docs/docs/tools/results.mdx`
   - Document that publication export reads completed run artifacts and does not
@@ -463,10 +463,10 @@ for rerun, comparison, grading, or adapter ingestion.
 - Inputs:
   - completed run workspace;
   - `index.jsonl` manifest;
-  - `benchmark.json`;
+  - `summary.json`;
   - optional sidecar-resolved artifact references for selected public payloads.
 - Outputs:
-  - compact `benchmark.json` and `index.jsonl` or a derived `publication.json`;
+  - compact `summary.json` and `index.jsonl` or a derived `publication.json`;
   - optional static assets for selected summaries;
   - no required `eval.txt`.
 - Privacy:
@@ -476,7 +476,7 @@ for rerun, comparison, grading, or adapter ingestion.
 **Test plan:**
 
 - `apps/cli/test/commands/results/export.test.ts`
-  - Publication export succeeds with only `benchmark.json` and `index.jsonl`.
+  - Publication export succeeds with only `summary.json` and `index.jsonl`.
   - Publication export fails clearly when the manifest is not an AgentV result row.
   - Payload opt-in includes only selected sidecar files.
 - `apps/cli/test/commands/results/report.test.ts`
@@ -615,7 +615,7 @@ results:
   - BWS injects or exports the S3 endpoint, region, bucket, access key id, and secret
     access key into environment variables before AgentV runs.
   - AgentV config interpolates variable names or reads environment variables directly.
-- Never persist resolved BWS values into `benchmark.json`, `index.jsonl`, oplog records,
+- Never persist resolved BWS values into `summary.json`, `index.jsonl`, oplog records,
   Dashboard responses, docs examples, or project registry files.
 
 **Test plan:**
@@ -740,14 +740,14 @@ results:
   `apps/web/src/content/docs/docs/tools/results.mdx`,
   `apps/cli/test/commands/results/export.test.ts`
 - **Approach:** Keep publication export read-only over completed run artifacts. Use
-  `parseJsonlResults()` and `benchmark.json` metadata as inputs. If a new command is
+  `parseJsonlResults()` and `summary.json` metadata as inputs. If a new command is
   clearer than another export option, keep it under `agentv results` but document it as
   projection-only.
 - **Patterns to follow:** `loadExportSource()` and `deriveOutputDir()` in
   `apps/cli/src/commands/results/export.ts`; `results report` docs for static output
   framing.
 - **Test scenarios:**
-  - Given a run with `index.jsonl` and `benchmark.json`, publication export succeeds
+  - Given a run with `index.jsonl` and `summary.json`, publication export succeeds
     with no `eval.txt`.
   - Given an invalid JSONL input that is not an AgentV result row, publication export
     fails with the existing result-row schema guidance.
@@ -873,7 +873,7 @@ results:
 - `packages/core/src/evaluation/results-repo.ts` for deterministic genesis,
   `directPushResults()`, `listGitRuns()`, and `materializeGitRun()`.
 - `packages/core/src/evaluation/run-artifacts.ts` and
-  `apps/cli/src/commands/eval/artifact-writer.ts` for `benchmark.json`,
+  `apps/cli/src/commands/eval/artifact-writer.ts` for `summary.json`,
   `index.jsonl`, `outputs/trace.json`, and transcript sidecars.
 - `apps/cli/src/commands/results/remote.ts`,
   `apps/cli/src/commands/results/remote-metadata.ts`,
diff --git a/docs/plans/2026-06-23-001-feat-repeat-runs-flaky-evals-plan.md b/docs/plans/2026-06-23-001-feat-repeat-runs-flaky-evals-plan.md
index 8aafcc5a6..17769b29b 100644
--- a/docs/plans/2026-06-23-001-feat-repeat-runs-flaky-evals-plan.md
+++ b/docs/plans/2026-06-23-001-feat-repeat-runs-flaky-evals-plan.md
@@ -100,7 +100,7 @@ Before introducing AgentV-specific contract shapes, implementation should check
 | Reference | Lowest common denominator to reuse | Intentional AgentV divergence |
 | --- | --- | --- |
 | Claude Skills schema | Use assertion, expectation, grading, `passed`, `failed`, `total`, and assertion-level `pass_rate` vocabulary for graders that expose assertion counts. | Do not copy the full skill-eval artifact shape. AgentV keeps `.agentv/results/<experiment>/<timestamp>/...` as the portable run bundle and uses `attempt_success_rate` for repeat-run reliability. |
-| Vercel `agent-eval` | Reuse fixture-driven hidden verifier ergonomics, case-level `summary.json`, and durable `run-1`, `run-2` attempt directories. | Keep AgentV root `benchmark.json` for current run-level compatibility, but do not write per-attempt `benchmark.json`. Rename Vercel `passRate` to `attempt_success_rate` where attempt-frequency stats are exposed in AgentV-specific artifacts. Do not inherit ambiguous CLI gating semantics. |
+| Vercel `agent-eval` | Reuse fixture-driven hidden verifier ergonomics, root `summary.json`, root `index.jsonl`, case-level `summary.json`, and durable `run-1`, `run-2` attempt directories. | Do not write the retired root benchmark file or a compatibility alias. Keep AgentV attempt-local `grading.json`, `metrics.json`, and `timing.json` as value-add sidecars. Rename Vercel `passRate` to `attempt_success_rate` where attempt-frequency stats are exposed in AgentV-specific artifacts. Do not inherit ambiguous CLI gating semantics. |
 | Hugging Face Datasets | Keep dataset, split, record, features, and row-oriented corpus vocabulary for eval inputs and benchmark corpora. Treat an AgentV case as a record-like unit when mapping to external datasets. | Do not require Arrow, the Hub, DatasetDict, or HF storage layout. AgentV cases remain repo files or generated case records inside benchmark/project artifacts. |
 | OpenInference | Preserve trace/span/tool-call/model-observability semantics when naming trace metadata and external trace correlation fields. | Do not require OpenTelemetry collection, Phoenix, or OpenInference export as core runtime infrastructure. AgentV stores portable traces/transcripts as artifacts and supports link-out correlation through `external_trace` metadata. |
 
@@ -150,9 +150,9 @@ Public docs and implementation notes must not reference non-public sources. If a
 
 - KTD1. The repeat config attaches to the **experiment** surface, not to `eval.yaml` `execution`, per the experiments-separation decision (epic `av-991`, recorded on `av-991.1`). This aligns with Vercel agent-eval, where `runs`/`earlyExit` are experiment-level. This epic (`av-i0l`) owns the repeat **mechanics** (schema shape, gate policies, attempt aggregation, flake classification, and the run-N artifact layout); `av-991` owns **placement** (the experiment contract the repeat block lives on). The existing `execution.trials` code path is **hard-removed** (no compatibility alias) because usage is rare; its behavior is replaced by the experiment-level repeat block. Because the experiment surface is delivered by `av-991`, the schema work in `av-i0l.1` depends on that contract landing.
 - KTD2. Keep one-run CI as the default. Repeat runs are for reliability evidence unless `repeat.gate` says they are a CI gate.
-- KTD3. Store aggregate rows in the top-level `index.jsonl`, not one row per attempt. Attempt details live in case-local `summary.json`, `grading.json`, and `run-N/` directories so existing aggregate consumers do not inflate case counts.
-- KTD4. Single-run cases keep direct case-local files instead of always nesting under `run-1`. This preserves the simple default artifact shape and makes `.agentv/results/<experiment>/<timestamp>/<case-id>/grading.json` easy to inspect. Repeat-enabled cases use `run-1/`, `run-2/`, and so on under the case directory.
-- KTD5. Root run aggregates keep the existing AgentV `benchmark.json` for compatibility. Repeat case aggregates use `summary.json` with flattened snake_case timing fields plus AgentV aggregate `grading.json`; repeat attempts use `run-N/` children.
+- KTD3. Store aggregate rows in the top-level `index.jsonl`, not one row per attempt. Attempt details live in case-local `summary.json` plus `run-N/` directories so existing aggregate consumers do not inflate case counts.
+- KTD4. Single-run cases also nest the lone attempt under `run-1`, matching Vercel's durable attempt directory shape and keeping future repeat expansion append-only.
+- KTD5. Root run aggregates use `summary.json`, which supersedes the old AgentV benchmark file; no compatibility alias is written. Root `index.jsonl` is the anchor for both local and git-backed remote run discovery.
 - KTD6. `pass_at_k` keeps the existing AgentV/Vercel ergonomics: early exit is enabled unless explicitly disabled. Full reliability sampling requires `early_exit: false` on the experiment and should be recorded because it changes cost and statistics.
 - KTD7. Do not inherit Vercel's implicit CI ambiguity. All policies that can make one failed plus one passed attempt count as passing must be visible in config and artifacts.
 - KTD8. Reuse current failure classification fields before adding new enums. Add aggregate classification fields only after mapping from `execution_status`, `failure_stage`, and `failure_reason_code` proves insufficient.
@@ -239,20 +239,22 @@ This design assumes the in-flight artifact layout migration moves local run bund
 
 ### Single-Run Case
 
-Single-run cases keep direct case-local files:
+Single-run cases use the same case summary plus attempt directory layout as repeated cases:
 
 ```text
 .agentv/results/<experiment>/<timestamp>/index.jsonl
-.agentv/results/<experiment>/<timestamp>/benchmark.json
-.agentv/results/<experiment>/<timestamp>/<case-id>/grading.json
-.agentv/results/<experiment>/<timestamp>/<case-id>/timing.json
-.agentv/results/<experiment>/<timestamp>/<case-id>/task/PROMPT.md
-.agentv/results/<experiment>/<timestamp>/<case-id>/outputs/trace.json
-.agentv/results/<experiment>/<timestamp>/<case-id>/outputs/transcript.jsonl
-.agentv/results/<experiment>/<timestamp>/<case-id>/outputs/answer.md
+.agentv/results/<experiment>/<timestamp>/summary.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/summary.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/result.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/grading.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/metrics.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/timing.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/transcript.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/transcript-raw.jsonl
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/outputs/answer.md
 ```
 
-Rationale: the common path stays readable, old mental models stay close, and no user pays a `run-1/` nesting tax for default CI.
+Rationale: the common path now matches Vercel's repeat-ready attempt layout. AgentV keeps attempt-local grading, metrics, timing, transcript, and output sidecars; it does not emit generated prompt sidecars.
 
 ### Repeat-Run Case
 
@@ -263,11 +265,15 @@ Repeat-run cases use attempt directories:
 .agentv/results/<experiment>/<timestamp>/<case-id>/grading.json
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-1/result.json
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-1/grading.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/metrics.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-1/timing.json
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-1/transcript.json
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-1/transcript-raw.jsonl
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-1/outputs/answer.md
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-2/result.json
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-2/grading.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-2/metrics.json
+.agentv/results/<experiment>/<timestamp>/<case-id>/run-2/timing.json
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-2/transcript.json
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-2/transcript-raw.jsonl
 .agentv/results/<experiment>/<timestamp>/<case-id>/run-2/outputs/answer.md
@@ -275,8 +281,9 @@ Repeat-run cases use attempt directories:
 
 Each `run-N/result.json` is the per-attempt manifest. It carries paths such as
 `grading_path`, `transcript_path`, `transcript_raw_path`, `output_paths.answer`,
-plus embedded timing/o11y metrics so repeat attempts do not need a separate
-`metrics.json` sidecar.
+plus embedded timing/o11y metrics. AgentV also writes attempt-local
+`grading.json`, `metrics.json`, and `timing.json` sidecars for inspection,
+dashboard rendering, and compatibility with Agent Skills-style evidence.
 
 `<case-id>` should reuse the sanitized artifact key produced by the current artifact writer, including suite disambiguation where needed. Attempt directories are one-indexed because users naturally inspect `run-1`, `run-2`, and this matches the Vercel comparison.
 
@@ -312,7 +319,7 @@ Threshold behavior:
 - `mean_score_at_least` operates on `mean_score` and should be included in v1 only if the implementation can reuse existing numeric score semantics without widening grader contracts.
 - Execution errors do not silently become quality failures. They are counted in `execution_error_attempts` and affect gates according to the selected policy.
 
-Benchmark reporting should add target and suite aggregate reliability stats where they can be computed from case summaries. It should not replace `benchmark.json` with a new database or flatten attempts into extra top-level tests.
+Benchmark reporting should add target and suite aggregate reliability stats where they can be computed from case summaries. It should not replace root `summary.json` with a new database, reintroduce the retired benchmark file, or flatten attempts into extra top-level tests.
 
 ---
 
@@ -376,7 +383,7 @@ Dashboard should present repeat-run cases aggregate-first:
 - Attempt drill-down lists `run-1`, `run-2`, and so on with score, status, duration, cost, failure reason, and retry/exclusion reason.
 - Selecting an attempt opens the same Checks, Transcript, Source, Files, and Feedback affordances as a normal single-run result.
 - Dashboard must not hide individual traces, transcripts, raw provider logs, or grader output behind the aggregate.
-- Historical single-run rows render as they do today, with `summary.json` absent.
+- Historical direct-sidecar single-run rows render as they do today, even when case `summary.json` or `run-1` paths are absent.
 
 For trend and compare views, repeat aggregates should be the default unit. Attempt-level views can be added as a filter later, but they must not silently change run-level counts.
 
@@ -419,7 +426,7 @@ For trend and compare views, repeat aggregates should be the default unit. Attem
   "attempts": [
     {
       "run": 1,
-      "artifact_dir": "case-1/run-1",
+      "run_path": "run-1",
       "score": 0.9,
       "execution_status": "ok",
       "duration_ms": 118000
@@ -487,12 +494,12 @@ Repeat runs can multiply provider spend. V1 should ship with conservative contro
 
 **Files:** `packages/core/src/evaluation/run-artifacts.ts`, `packages/core/src/evaluation/result-row-schema.ts`, `apps/cli/src/commands/eval/artifact-writer.ts`, `apps/cli/src/commands/eval/result-layout.ts`, `apps/cli/test/commands/eval/artifact-writer.test.ts`, `apps/cli/test/commands/eval/aggregate.test.ts`, `apps/cli/test/commands/results/validate.test.ts`.
 
-**Approach:** Extend the artifact writer to understand aggregate results with attempt children. Keep single-run case sidecars direct, add case-local `summary.json` for repeat aggregates, and add optional repeat fields to index rows. Avoid putting full attempt payloads in `index.jsonl`.
+**Approach:** Extend the artifact writer to understand aggregate results with attempt children. Use case-local `summary.json` for every case, store single-run and repeated attempts under `run-N/`, and add optional repeat fields to index rows. Avoid putting full attempt payloads in `index.jsonl`.
 
 **Test Scenarios:**
 
-- Single-run output writes direct case-local sidecars and remains readable by existing manifest hydration.
-- Repeat-run output writes `summary.json` and `run-1/`, `run-2/` sidecars with correct relative paths.
+- Single-run output writes case `summary.json` plus `run-1/` sidecars and remains readable by existing manifest hydration.
+- Repeat-run output writes case `summary.json` and `run-1/`, `run-2/` sidecars with correct relative paths.
 - `index.jsonl` has one aggregate row per case/target and compact attempt references.
 - Historical rows without repeat fields parse successfully.
 - Validation reports missing attempt sidecars when a repeat summary points to absent `run-N` artifacts.
diff --git a/docs/plans/2026-06-23-002-experiments-separation-plan.md b/docs/plans/2026-06-23-002-experiments-separation-plan.md
index 7c626ecd1..da5889a8a 100644
--- a/docs/plans/2026-06-23-002-experiments-separation-plan.md
+++ b/docs/plans/2026-06-23-002-experiments-separation-plan.md
@@ -203,7 +203,7 @@ surface where compatibility is not required.
 ## Artifact Impact
 
 Existing artifact writers already accept an experiment label. Phase 1 should
-continue writing the resolved experiment name to `benchmark.json`, `index.jsonl`,
+continue writing the resolved experiment name to `summary.json`, `index.jsonl`,
 trace envelopes, and results repository paths.
 
 Later artifact work should add:
diff --git a/docs/plans/git-native-results.md b/docs/plans/git-native-results.md
index 853c036e5..a73d9be28 100644
--- a/docs/plans/git-native-results.md
+++ b/docs/plans/git-native-results.md
@@ -14,7 +14,7 @@ After comparing with **entireio** (single-ref + git tree as index) and **skillfu
 
 ## Core idea
 
-The configured results branch tree IS the index. `git ls-tree -r <storage-ref> -- runs/` lists every run path without reading every blob. `git cat-file --batch` reads existing `benchmark.json` blobs in one subprocess call. No separate index file. No drift. Natural pruning when runs are deleted. With `--filter=blob:none` clone, individual run blobs are fetched lazily when a user opens the detail view.
+The configured results branch tree IS the index. `git ls-tree -r <storage-ref> -- runs/` lists every run path without reading every blob. `git cat-file --batch` reads existing `summary.json` blobs in one subprocess call. No separate index file. No drift. Natural pruning when runs are deleted. With `--filter=blob:none` clone, individual run blobs are fetched lazily when a user opens the detail view.
 
 ## Architecture
 
@@ -59,7 +59,7 @@ Each run is one commit. Files are unique to that run, so rebases never content-c
 ### Reads
 
 **Listing** (replaces `listResultFilesFromRunsDir`):
-- `git ls-tree -r <storage-ref> -- runs/` → filter for `benchmark.json` paths
+- `git ls-tree -r <storage-ref> -- runs/` → filter for `summary.json` paths
 - `git cat-file --batch` → read those blobs in one subprocess
 - Derive `run_id` from path (same logic as current `buildRunId`)
 - Sort by timestamp descending
@@ -91,7 +91,7 @@ Each run is one commit. Files are unique to that run, so rebases never content-c
 - `normalizeResultsConfig()` accepts `repo_url`/legacy `repo` or `repo_path`, but prerelease docs and config examples use `repo_url` or `repo_path`.
 - `directPushResults()` resolves the results store, builds one storage-branch commit for the completed run, and pushes when `sync.auto_push` or `sync.require_push` is enabled.
 - `commitResultsRunWithTemporaryIndex()` writes blobs into the repo object database and updates the storage branch via a temporary index. This is the normal `repo_path: .` path and avoids copying files into a checked-out results branch.
-- `listGitRuns()` uses `git ls-tree` plus `git cat-file --batch` against `runs/**/benchmark.json`. A not-yet-created storage branch (ref does not exist) returns `[]` rather than throwing, so the Dashboard's remote-results poll stays quiet before the first push.
+- `listGitRuns()` uses `git ls-tree` plus `git cat-file --batch` against `runs/**/summary.json`. A not-yet-created storage branch (ref does not exist) returns `[]` rather than throwing, so the Dashboard's remote-results poll stays quiet before the first push.
 - `setupWipWorktree()` and `pushWipCheckpoint()` maintain recoverable in-progress branches under `agentv/wip/...`.
 
 ## Breaking changes
@@ -128,8 +128,8 @@ Breaking changes accepted because no production users yet. Document in release n
 ## What this PR does NOT do
 
 - Doesn't add a separate index file (the index IS the git tree)
-- Doesn't ship a `reindex` migration command (nothing to backfill — `benchmark.json` already exists per run)
-- Doesn't change the artifact format (`benchmark.json`, `index.jsonl`, per-test dirs stay as-is)
+- Doesn't ship a `reindex` migration command (nothing to backfill — `summary.json` already exists per run)
+- Doesn't change the artifact format (`summary.json`, `index.jsonl`, per-test dirs stay as-is)
 - Doesn't add server-side caching (deferred)
 - Doesn't add PR-based publishing (deferred)
 - Doesn't touch the source repo's normal branch history (only the configured results storage branch/repo)
diff --git a/docs/plans/results-storage-retention-oplog-plan.md b/docs/plans/results-storage-retention-oplog-plan.md
index 711d64ec9..e48fdbd27 100644
--- a/docs/plans/results-storage-retention-oplog-plan.md
+++ b/docs/plans/results-storage-retention-oplog-plan.md
@@ -28,11 +28,11 @@ The first research doc is load-bearing for the single-ref git model, compact der
 
 ## Problem Frame
 
-AgentV already has the beginning of a git-native results store. `packages/core/src/evaluation/results-repo.ts` defaults to `agentv/results/v1`, creates a deterministic orphan genesis commit, lists remote runs with `git ls-tree`, reads `benchmark.json` blobs through `git cat-file --batch`, publishes run trees without checking out the storage branch, and has support for the heavy artifact sidecar ref `agentv/artifacts/v1`.
+AgentV already has the beginning of a git-native results store. `packages/core/src/evaluation/results-repo.ts` defaults to `agentv/results/v1`, creates a deterministic orphan genesis commit, lists remote runs with `git ls-tree`, reads `summary.json` blobs through `git cat-file --batch`, publishes run trees without checking out the storage branch, and has support for the heavy artifact sidecar ref `agentv/artifacts/v1`.
 
 The next storage beads need one reviewed contract before implementation splits across retention, object storage, publication export, path-sharding assessment, and mutable operations. Without that contract, each bead could accidentally create its own branch layout, backend abstraction, transcript boundary, or dashboard read model.
 
-The product boundary stays unchanged: AgentV remains the repo-native and workspace-native source of truth for run artifacts. Object storage, SQLite, and publication exports are storage tiers, caches, or derived projections over AgentV artifacts. Phoenix is link-out correlation only when safe `external_trace` metadata points at independently emitted spans; it is not an AgentV artifact projection or storage tier. A `project` holds runs, traces, and experiments; a `benchmark` is a curated eval suite, and the per-run `benchmark.json` artifact keeps that artifact name.
+The product boundary stays unchanged: AgentV remains the repo-native and workspace-native source of truth for run artifacts. Object storage, SQLite, and publication exports are storage tiers, caches, or derived projections over AgentV artifacts. Phoenix is link-out correlation only when safe `external_trace` metadata points at independently emitted spans; it is not an AgentV artifact projection or storage tier. A `project` holds runs, traces, and experiments; a `benchmark` is a curated eval suite. The per-run root aggregate artifact is `summary.json`, which is a separate concept from a benchmark.
 
 ---
 
@@ -65,7 +65,7 @@ The product boundary stays unchanged: AgentV remains the repo-native and workspa
 ### Retention, Export, And Oplog
 
 - R16. av-dcs implements prune/compact for git and hybrid modes, including migration of existing git-sidecar transcript payloads before compaction when hybrid object storage is configured.
-- R17. av-kxa implements compact publication as a regenerable derived export from `benchmark.json`, `index.jsonl`, and run artifacts, with no required `eval.txt`.
+- R17. av-kxa implements compact publication as a regenerable derived export from `summary.json`, `index.jsonl`, and run artifacts, with no required `eval.txt`.
 - R18. av-8un implements raw mutable operations as per-actor append-only segments, folds tags with add-wins semantics first, and later supports `run.delete` and `run.restore` tombstones.
 - R19. Dashboard reads materialized final state stored with run/result data, not raw oplog segments.
 
@@ -82,7 +82,7 @@ The product boundary stays unchanged: AgentV remains the repo-native and workspa
 
 - KTD1. Use `git-native`, `hybrid`, and `blob-native` as storage backend modes. The current `results.mode: github` is transport-era naming and should not become the long-term storage-mode field.
 - KTD2. Add new wire fields in snake_case, such as `storage_mode` and `object_store`, while keeping TypeScript internals in camelCase, such as `storageMode` and `objectStore`.
-- KTD3. Keep the git-backed listing contract centered on `benchmark.json` and `index.jsonl`. Do not add a new canonical SQL, JSON index, or `eval.txt` authoring surface.
+- KTD3. Keep the git-backed listing contract centered on `summary.json` and `index.jsonl`. Do not add a new canonical SQL, JSON index, or `eval.txt` authoring surface.
 - KTD4. Keep `agentv/results/v1`, `agentv/artifacts/v1`, and `agentv/oplog/v1` as sibling refs. Do not use refs like `agentv/results/v1/artifacts`.
 - KTD5. Use one artifact sidecar ref with typed path prefixes. Start with prefixes such as `transcripts/`, `traces/`, `raw-logs/`, `outputs/`, and `screenshots/`; do not use one branch per artifact type unless measurement later proves it is needed.
 - KTD6. Do not shard `runs/` in this spec. The lightweight measurement below shows current realistic scale is acceptable, and av-thr owns deeper profiling before any layout change.
@@ -108,7 +108,7 @@ The product boundary stays unchanged: AgentV remains the repo-native and workspa
 
 ## Lightweight Path-Sharding Measurement
 
-I ran a temp-only Git measurement on 2026-06-21 using the current `runs/<experiment>/<timestamp>/` shape. Each synthetic run had a small `benchmark.json` and `index.jsonl`; the measurement approximates `listGitRuns()` by listing tree paths and batch-reading all `benchmark.json` blobs.
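+I ran a temp-only Git measurement on 2026-06-21 using the current `runs/<experiment>/<timestamp>/` shape. Each synthetic run had a small `summary.json` and `index.jsonl`; the measurement approximates `listGitRuns()` by listing tree paths and batch-reading all `summary.json` blobs. The run-summary file was still named `benchmark.json` when the measurement was taken, which is why the table columns below are labeled "benchmarks".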
 
 | Runs | `git ls-tree` | `git cat-file --batch` for benchmarks | Benchmark paths |
 | ---: | ---: | ---: | ---: |
@@ -145,7 +145,7 @@ flowchart TB
 
 | Mode | Listing/index strategy | Heavy artifact strategy | Raw oplog location | Retention strategy |
 | --- | --- | --- | --- | --- |
-| `git-native` | `git ls-tree` over `agentv/results/v1:runs/**`, batch-read `benchmark.json`, read `index.jsonl` for detail | `agentv/artifacts/v1` sidecar ref with typed prefixes | `agentv/oplog/v1` | av-dcs prune/compact refs plus eventual GitHub GC |
+| `git-native` | `git ls-tree` over `agentv/results/v1:runs/**`, batch-read `summary.json`, read `index.jsonl` for detail | `agentv/artifacts/v1` sidecar ref with typed prefixes | `agentv/oplog/v1` | av-dcs prune/compact refs plus eventual GitHub GC |
 | `hybrid` | Same git listing for metadata, benchmark, index, synopsis, and materialized state | Content-addressed B2 objects, with git pointers | `agentv/oplog/v1` unless explicitly moved later | av-dcs git compaction plus B2 lifecycle rules |
 | `blob-native` | Bucket manifest is the fast path; `ListObjectsV2` over stable prefixes is fallback/rebuild | All artifacts are B2 objects | Object-store oplog prefix | Bucket lifecycle, manifest compaction, and delete tombstones |
 
@@ -189,7 +189,7 @@ flowchart TB
 
 **Listing/index strategy by mode:**
 
-- `git-native`: `listGitRuns()` remains the fast path. It lists `runs/**/benchmark.json` on `agentv/results/v1`, batch-reads those JSON blobs, and uses `index.jsonl` only when callers need per-test detail.
+- `git-native`: `listGitRuns()` remains the fast path. It lists `runs/**/summary.json` on `agentv/results/v1`, batch-reads those JSON blobs, and uses `index.jsonl` only when callers need per-test detail.
 - `hybrid`: same listing as `git-native`; heavy payloads are not required for list views. Pointer records in `index.jsonl` decide whether detail reads use git sidecar or S3.
 - `blob-native`: maintain a bucket manifest, for example a compact run listing object under a stable prefix, as the fast path. Use paginated `ListObjectsV2` over `runs/` or manifest prefixes as a rebuild/fallback path. Manifest reads are cheaper for Dashboard; ListObjectsV2 is simpler but can become expensive and pagination-sensitive.
 
@@ -355,9 +355,9 @@ Current pointer construction and publish-time key rewrite are not fully aligned.
 - `apps/cli/src/commands/results/export.ts`
   - Keep `buildProjectionBundleFromExportedIndex()` as a pattern for derived exports.
   - Add av-kxa export entry points for rollup/synopsis output, either as flags on `results export` or a new subcommand if the UX is clearer.
-  - Read from `benchmark.json`, `index.jsonl`, and selected run artifacts.
+  - Read from `summary.json`, `index.jsonl`, and selected run artifacts.
 - New helper, suggested file `apps/cli/src/commands/results/publication-export.ts`
-  - Build per-eval rollup JSON from `benchmark.json` plus `index.jsonl`.
+  - Build per-eval rollup JSON from `summary.json` plus `index.jsonl`.
   - Build per-test/run synopsis records with status, duration, pass/fail, score, target, and artifact pointers.
   - Keep output compact enough for static sites and leaderboard-like views.
 - `apps/cli/src/commands/eval/artifact-writer.ts`
@@ -373,7 +373,7 @@ Current pointer construction and publish-time key rewrite are not fully aligned.
 
 **No required `eval.txt`:**
 
-`benchmark.json` already carries eval/run metadata, and `index.jsonl` is the durable per-test contract. If a human-readable label is useful, derive it in the export from existing metadata. Do not require an authored or generated `eval.txt` file.
+`summary.json` already carries eval/run metadata, and `index.jsonl` is the durable per-test contract. If a human-readable label is useful, derive it in the export from existing metadata. Do not require an authored or generated `eval.txt` file.
 
 **Acceptance criteria:**
 
@@ -547,7 +547,7 @@ Object keys should be content-addressed, such as `sha256/<hash>` under an AgentV
 - **Child refs under `agentv/results/v1/*`:** Rejected because Git refs are path-like and a ref cannot safely coexist with child refs beneath the same path.
 - **Branch-per-artifact-type sidecars:** Rejected until measurement proves a need. One `agentv/artifacts/v1` ref with typed prefixes is simpler to validate, fetch, compact, and migrate.
 - **Canonical `transcript.json`:** Rejected because it duplicates `outputs/transcript.jsonl` and creates drift risk. JSON object summaries should be derived indexes or synopses with different names.
-- **Required `eval.txt`:** Rejected because `benchmark.json` and `index.jsonl` already carry the durable machine-readable contract, and publication is derived.
+- **Required `eval.txt`:** Rejected because `summary.json` and `index.jsonl` already carry the durable machine-readable contract, and publication is derived.
 - **B2-native APIs:** Rejected because Backblaze B2's S3-compatible endpoint lets AgentV use standard S3 clients, MinIO CI, and portable user configuration.
 - **Vercel Blob as a dependency:** Rejected because it is provider-specific and weaker than the desired content-addressed private bucket model.
 - **SQLite as canonical storage:** Rejected because it would move AgentV away from portable run artifacts. SQLite is a rebuildable projection only.
@@ -563,7 +563,7 @@ Object keys should be content-addressed, such as `sha256/<hash>` under an AgentV
 - **Blob-native listing scalability:** Use bucket manifests for fast Dashboard reads and keep paginated `ListObjectsV2` as rebuild/fallback.
 - **Oplog fold ambiguity:** Start with tags and deterministic add-wins semantics; defer richer CRDTs until a concrete mutable field needs them.
 - **Stale materialized state:** Store oplog watermarks and surface stale state as sync/reconcile status.
-- **Overloading project/benchmark terms:** Keep project registry language separate from per-run `benchmark.json` artifacts.
+- **Overloading project/benchmark terms:** Keep project registry language separate from benchmark (curated eval suite) language, and do not refer to the per-run `summary.json` artifact as a "benchmark".
 
 ---
 
@@ -578,7 +578,7 @@ For downstream implementation beads:
 
 - av-dsc: run unit tests for config parsing, object-store client, pointer verification, MinIO integration, and real B2 dogfood when credentials are present.
 - av-dcs: run tmp-git integration tests proving pruned runs disappear, retained runs list and open in Dashboard, migration verifies checksums, compaction is idempotent, and source worktrees are not switched.
-- av-kxa: test compact export determinism from `benchmark.json`, `index.jsonl`, and run artifacts; prove no required `eval.txt`.
+- av-kxa: test compact export determinism from `summary.json`, `index.jsonl`, and run artifacts; prove no required `eval.txt`.
 - av-8un: test per-actor append, no-conflict publishing, tag fold ordering, add-wins concurrent add/remove, `run.delete`/`run.restore` tombstones, materialized watermark staleness, and migration from existing tag overlays.
 - av-thr: repeat the path-sharding measurement with realistic artifacts, cold/warm partial clone cases, list/detail Dashboard routes, 50-200 run realistic volumes, and low-thousands stress cases.
 - av-7uu and av-kve.5: test that list/aggregate paths do not read transcript bodies or trace sidecars and that SQLite can be deleted/rebuilt.
diff --git a/docs/plans/trace-envelope-implementation-spec.md b/docs/plans/trace-envelope-implementation-spec.md
index 1d1c800bb..5bf74b0a3 100644
--- a/docs/plans/trace-envelope-implementation-spec.md
+++ b/docs/plans/trace-envelope-implementation-spec.md
@@ -43,7 +43,7 @@ Source of truth:
 Non-goals:
 
 - Do not invent an AgentV-specific canonical trace graph.
-- Do not change existing public result JSONL, `benchmark.json`, `grading.json`,
+- Do not change existing public result JSONL, `summary.json`, `grading.json`,
   `timing.json`, replay fixture JSONL, or `outputs/transcript.jsonl` schemas in
   the first implementation slice.
 - Do not build Phoenix, Langfuse, Braintrust, or LangSmith vendor adapters here.
diff --git a/docs/solutions/architecture-patterns/separate-eval-tasks-from-experiment-runtime.md b/docs/solutions/architecture-patterns/separate-eval-tasks-from-experiment-runtime.md
index 849024d49..26a811005 100644
--- a/docs/solutions/architecture-patterns/separate-eval-tasks-from-experiment-runtime.md
+++ b/docs/solutions/architecture-patterns/separate-eval-tasks-from-experiment-runtime.md
@@ -111,7 +111,7 @@ tests:
 ```text
 .agentv/results/<experiment>/<timestamp>/
   index.jsonl
-  benchmark.json
+  summary.json
   timing.json
   <suite>/<case>/
     task/PROMPT.md
diff --git a/examples/features/sdk-programmatic-api/README.md b/examples/features/sdk-programmatic-api/README.md
index 178abb3cb..d162ed3fa 100644
--- a/examples/features/sdk-programmatic-api/README.md
+++ b/examples/features/sdk-programmatic-api/README.md
@@ -26,4 +26,4 @@ bun run evaluate.ts
 - **Inline tests** — define YAML-shaped tests directly in TypeScript
 - **Config mirrors YAML** — same evaluation model, with programmatic `assert` and camelCase fields
 - **Typed results** — `EvalRunResult` with summary statistics
-- **Canonical artifacts** — opt into the same `index.jsonl` / `benchmark.json` workspace layout as `agentv eval`
+- **Canonical artifacts** — opt into the same `index.jsonl` / `summary.json` workspace layout as `agentv eval`
diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts
index f05fc1231..be48b831a 100644
--- a/packages/core/src/evaluation/evaluate.ts
+++ b/packages/core/src/evaluation/evaluate.ts
@@ -205,7 +205,7 @@ export interface EvalConfig {
   readonly budgetUsd?: number;
   /** Optional run workspace directory for canonical AgentV artifacts. */
   readonly outputDir?: string;
-  /** Optional experiment name recorded in benchmark.json and index.jsonl. */
+  /** Optional experiment name recorded in summary.json and index.jsonl. */
   readonly experiment?: string;
 }
 
@@ -256,8 +256,7 @@ export interface EvalRunResult {
 export interface EvalRunArtifacts {
   readonly runDir: string;
   readonly indexPath: string;
-  readonly benchmarkPath: string;
-  readonly timingPath: string;
+  readonly summaryPath: string;
 }
 
 /**
@@ -386,11 +385,10 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
         evalFile: config.specFile ? testFilePath : '',
         experiment: config.experiment,
         sourceTests: materialized.tests,
-      }).then(({ benchmarkPath, indexPath, timingPath }) => ({
+      }).then(({ summaryPath, indexPath }) => ({
         runDir: outputDir,
-        benchmarkPath,
+        summaryPath,
         indexPath,
-        timingPath,
       }))
     : undefined;
 
diff --git a/packages/core/src/evaluation/result-row-schema.ts b/packages/core/src/evaluation/result-row-schema.ts
index 4fafbe960..709684547 100644
--- a/packages/core/src/evaluation/result-row-schema.ts
+++ b/packages/core/src/evaluation/result-row-schema.ts
@@ -21,7 +21,6 @@ const MIGRATION_GUIDANCE =
 const RESULT_ROW_ALIASES = {
   answerPath: 'answer_path',
   artifactDir: 'artifact_dir',
-  benchmarkPath: 'benchmark_path',
   conversationId: 'conversation_id',
   costUsd: 'cost_usd',
   durationMs: 'duration_ms',
@@ -178,7 +177,7 @@ function looksLikeResultRow(value: Record<string, unknown>): boolean {
     Object.hasOwn(value, 'trace') ||
     Object.hasOwn(value, 'spans') ||
     Object.hasOwn(value, 'target') ||
-    Object.hasOwn(value, 'benchmark_path') ||
+    Object.hasOwn(value, 'summary_path') ||
     Object.hasOwn(value, 'grading_path') ||
     Object.hasOwn(value, 'timing_path')
   );
diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts
index 5c0d38372..d1de4a70e 100644
--- a/packages/core/src/evaluation/results-repo.ts
+++ b/packages/core/src/evaluation/results-repo.ts
@@ -53,7 +53,6 @@ const GIT_ENV_INHERIT_ALLOWLIST = new Set([
   'GIT_USERNAME',
 ]);
 export const DEFAULT_RESULTS_BRANCH = AGENTV_RESULTS_PRIMARY_REF;
-const MANAGED_RESULTS_REMOTE = 'agentv-results';
 const GIT_EMPTY_TREE = '4b825dc642cb6eb9a060e54bf8d69288fbee4904';
 // The results branch is a self-rooted orphan whose first commit is a fixed,
 // byte-identical empty-tree genesis. Pinning the message, identity, and dates
@@ -383,9 +382,7 @@ export function normalizeResultsConfig(
   const repo = repoUrl ?? repoPath ?? '';
   const branch = config.branch?.trim() || (repoPath ? DEFAULT_RESULTS_BRANCH : undefined);
   const useStorageBranchWorktree = Boolean(repoPath || (repoUrl && explicitClonePath && branch));
-  const remote =
-    config.remote?.trim() ||
-    (repoUrl && useStorageBranchWorktree ? MANAGED_RESULTS_REMOTE : 'origin');
+  const remote = config.remote?.trim() || 'origin';
   const autoPush = config.sync?.auto_push ?? config.auto_push === true;
   const requirePush = config.sync?.require_push === true;
   const configuredPushConflictPolicy = (
@@ -908,36 +905,6 @@ async function addResultsGitattributesToIndex(
   }
 }
 
-async function ensureResultsRepoRemote(
-  repoDir: string,
-  config: NormalizedResultsConfig,
-): Promise<void> {
-  if (!config.repo_url) {
-    return;
-  }
-
-  const remoteUrl = resolveResultsRepoUrl(config.repo_url);
-  // Same-repo results push to the project's existing remote (typically
-  // `origin`). Never rewrite that remote to a value that is not a real Git
-  // remote URL (e.g. a local results path or `.`), which would otherwise
-  // clobber a correct origin with a synthesized URL.
-  if (!isExplicitRemoteUrl(remoteUrl)) {
-    return;
-  }
-  const { stdout } = await runGit(['remote', 'get-url', config.remote], {
-    cwd: repoDir,
-    check: false,
-  });
-  const existingUrl = stdout.trim();
-  if (!existingUrl) {
-    await runGit(['remote', 'add', config.remote, remoteUrl], { cwd: repoDir });
-    return;
-  }
-  if (existingUrl !== remoteUrl) {
-    await runGit(['remote', 'set-url', config.remote, remoteUrl], { cwd: repoDir });
-  }
-}
-
 function updateStatusFile(
   config: ResultsConfig | NormalizedResultsConfig,
   patch: PersistedStatus,
@@ -977,10 +944,10 @@ export async function ensureResultsRepoClone(config: ResultsConfig): Promise<str
       await runGit([
         'clone',
         '--filter=blob:none',
+        ...(normalized.remote === 'origin' ? [] : ['--origin', normalized.remote]),
         resolveResultsRepoUrl(normalized.repo_url ?? normalized.repo),
         cloneDir,
       ]);
-      await ensureResultsRepoRemote(cloneDir, normalized);
       await ensureResultsMergeConfig(cloneDir);
       return cloneDir;
     } catch (error) {
@@ -993,7 +960,6 @@ export async function ensureResultsRepoClone(config: ResultsConfig): Promise<str
     throw new Error(`Results repo clone path is not a git repository: ${cloneDir}`);
   }
 
-  await ensureResultsRepoRemote(cloneDir, normalized);
   await ensureResultsMergeConfig(cloneDir);
   return cloneDir;
 }
@@ -2128,7 +2094,6 @@ export async function getResultsRepoSyncStatus(config?: ResultsConfig): Promise<
   }
 
   try {
-    await ensureResultsRepoRemote(normalized.path, normalized);
     if (usesStorageBranchWorktree(normalized)) {
       await fetchResultsRepo(normalized.path, normalized.remote, normalized.branch).catch(
         () => undefined,
@@ -3790,7 +3755,7 @@ export interface GitListedRun {
   pass_rate?: number;
   target?: string;
   manifest_path: string;
-  benchmark_path: string;
+  summary_path?: string;
   display_name: string;
   test_count: number;
   avg_score: number;
@@ -3802,7 +3767,7 @@ type GitBatchBlob = {
   readonly content: Buffer;
 };
 
-type GitRunBenchmark = {
+type GitRunSummary = {
   readonly metadata?: {
     readonly display_name?: string;
     readonly timestamp?: string;
@@ -3832,8 +3797,8 @@ function buildGitRunId(relativeRunPath: string): string {
   return segments[0] ?? relativeRunPath;
 }
 
-function getRunExperiment(runId: string, benchmark: GitRunBenchmark): string {
-  const experiment = benchmark.metadata?.experiment?.trim();
+function getRunExperiment(runId: string, summary: GitRunSummary | undefined): string {
+  const experiment = summary?.metadata?.experiment?.trim();
   if (experiment) {
     return experiment;
   }
@@ -3842,7 +3807,7 @@ function getRunExperiment(runId: string, benchmark: GitRunBenchmark): string {
   return separatorIndex === -1 ? 'default' : runId.slice(0, separatorIndex);
 }
 
-function computeAveragePassRate(runSummary: GitRunBenchmark['run_summary']): number | undefined {
+function computeAveragePassRate(runSummary: GitRunSummary['run_summary']): number | undefined {
   if (!runSummary) {
     return undefined;
   }
@@ -4158,45 +4123,97 @@ export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise
     throw error;
   }
 
-  const benchmarkPaths = treeOut
+  const treePaths = treeOut
     .split(/\r?\n/)
     .map((line) => line.trim())
-    .filter((line) => line.endsWith('/benchmark.json'));
-  if (benchmarkPaths.length === 0) {
+    .filter(Boolean);
+  const indexPaths = treePaths.filter((line) => line.endsWith('/index.jsonl'));
+  if (indexPaths.length === 0) {
     return [];
   }
 
-  const batchInput = `${benchmarkPaths.map((benchmarkPath) => `${ref}:${benchmarkPath}`).join('\n')}\n`;
+  const batchInput = `${indexPaths.map((indexPath) => `${ref}:${indexPath}`).join('\n')}\n`;
   const blobs = parseGitBatchBlobs(await runGitBatch(repoDir, batchInput));
-  if (blobs.length !== benchmarkPaths.length) {
+  if (blobs.length !== indexPaths.length) {
     throw new Error(
-      `Expected ${benchmarkPaths.length} git blobs but received ${blobs.length} while listing results runs`,
+      `Expected ${indexPaths.length} git blobs but received ${blobs.length} while listing results runs`,
     );
   }
 
+  const summaryPaths = indexPaths
+    .map((indexPath) => path.posix.join(path.posix.dirname(indexPath), 'summary.json'))
+    .filter((summaryPath) => treePaths.includes(summaryPath));
+  const summaryByPath = new Map<string, GitRunSummary>();
+  if (summaryPaths.length > 0) {
+    const summaryBatchInput = `${summaryPaths.map((summaryPath) => `${ref}:${summaryPath}`).join('\n')}\n`;
+    const summaryBlobs = parseGitBatchBlobs(await runGitBatch(repoDir, summaryBatchInput));
+    for (let i = 0; i < summaryBlobs.length; i++) {
+      const summaryPath = summaryPaths[i];
+      const blob = summaryBlobs[i];
+      if (!summaryPath || !blob) continue;
+      summaryByPath.set(summaryPath, JSON.parse(blob.content.toString('utf8')) as GitRunSummary);
+    }
+  }
+
   const runs = blobs.flatMap((blob, index): GitListedRun[] => {
-    const benchmarkPath = benchmarkPaths[index];
-    const benchmark = JSON.parse(blob.content.toString('utf8')) as GitRunBenchmark;
-    const runDir = path.posix.dirname(benchmarkPath);
+    const manifestPath = indexPaths[index];
+    const runDir = path.posix.dirname(manifestPath);
+    const summaryPath = path.posix.join(runDir, 'summary.json');
+    const summary = summaryByPath.get(summaryPath);
     const relativeRunPath = path.posix.relative(RESULTS_REPO_RUNS_DIR, runDir);
     const runId = buildGitRunId(relativeRunPath);
-    const timestamp = benchmark.metadata?.timestamp?.trim() || path.posix.basename(runDir);
-    const displayName = benchmark.metadata?.display_name?.trim() || path.posix.basename(runDir);
-    const targets = benchmark.metadata?.targets ?? [];
-    const passRate = computeAveragePassRate(benchmark.run_summary);
+    const rows = blob.content
+      .toString('utf8')
+      .split(/\r?\n/)
+      .map((line) => line.trim())
+      .filter(Boolean)
+      .flatMap(
+        (line): { timestamp?: string; target?: string; test_id?: string; score?: number }[] => {
+          try {
+            return [
+              JSON.parse(line) as {
+                timestamp?: string;
+                target?: string;
+                test_id?: string;
+                score?: number;
+              },
+            ];
+          } catch {
+            return [];
+          }
+        },
+      );
+    const rowTargets = [
+      ...new Set(rows.map((row) => row.target).filter((target): target is string => !!target)),
+    ];
+    const rowTestIds = [
+      ...new Set(rows.map((row) => row.test_id).filter((testId): testId is string => !!testId)),
+    ];
+    const rowScores = rows
+      .map((row) => row.score)
+      .filter((score): score is number => typeof score === 'number' && Number.isFinite(score));
+    const avgScore =
+      rowScores.length > 0
+        ? rowScores.reduce((sum, score) => sum + score, 0) / rowScores.length
+        : 0;
+    const timestamp =
+      summary?.metadata?.timestamp?.trim() || rows[0]?.timestamp || path.posix.basename(runDir);
+    const displayName = summary?.metadata?.display_name?.trim() || path.posix.basename(runDir);
+    const targets = summary?.metadata?.targets ?? rowTargets;
+    const passRate = computeAveragePassRate(summary?.run_summary) ?? avgScore;
 
     return [
       {
         run_id: runId,
-        experiment: getRunExperiment(runId, benchmark),
+        experiment: getRunExperiment(runId, summary),
         timestamp,
         ...(passRate !== undefined && { pass_rate: passRate }),
         ...(targets.length === 1 && targets[0] ? { target: targets[0] } : {}),
-        manifest_path: path.posix.join(runDir, 'index.jsonl'),
-        benchmark_path: benchmarkPath,
+        manifest_path: manifestPath,
+        ...(summaryByPath.has(summaryPath) && { summary_path: summaryPath }),
         display_name: displayName,
-        test_count: benchmark.metadata?.tests_run?.length ?? 0,
-        avg_score: 0,
+        test_count: summary?.metadata?.tests_run?.length ?? rowTestIds.length,
+        avg_score: avgScore,
         size_bytes: blob.size,
       },
     ];
diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts
index c28cf6b11..9d23fb2b5 100644
--- a/packages/core/src/evaluation/run-artifacts.ts
+++ b/packages/core/src/evaluation/run-artifacts.ts
@@ -2,9 +2,9 @@
  * Canonical AgentV run artifact helpers.
  *
  * This module owns the shared run-workspace contract used by CLI and
- * programmatic evals: `index.jsonl`, `benchmark.json`, `timing.json`, per-test
- * grading/timing/output sidecars, and transcript projections. Keep wire keys in
- * snake_case here so every caller produces the same artifacts.
+ * programmatic evals: `index.jsonl`, run-root `summary.json`, per-case
+ * `summary.json`, `run-N/result.json`, and transcript projections. Keep wire
+ * keys in snake_case here so every caller produces the same artifacts.
  */
 
 import { createHash } from 'node:crypto';
@@ -63,6 +63,7 @@ import type {
 } from './types.js';
 
 export const RESULT_INDEX_FILENAME = 'index.jsonl';
+export const RUN_SUMMARY_FILENAME = 'summary.json';
 
 const TIMING_SOURCE_VALUES = [
   'provider_reported',
@@ -102,36 +103,33 @@ export async function aggregateRunDir(
     plannedTestCount?: number;
     experimentMetadata?: ExperimentArtifactMetadata;
   },
-): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
+): Promise<{ summaryPath: string; testCount: number; targetCount: number }> {
   const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
   const content = await readFile(indexPath, 'utf8');
   const allResults = parseJsonlResults(content);
   const results = deduplicateByTestIdTarget(allResults);
 
-  const timing = buildTimingArtifact(results);
-  const timingPath = path.join(runDir, 'timing.json');
-  await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
-
   const plannedTestCount =
-    options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json')));
+    options?.plannedTestCount ??
+    (await readPlannedTestCount(path.join(runDir, RUN_SUMMARY_FILENAME)));
 
-  const benchmark = buildBenchmarkArtifact(
+  const summary = buildRunSummaryArtifact(
     results,
     options?.evalFile,
     options?.experiment,
     plannedTestCount,
     options?.experimentMetadata,
   );
-  const benchmarkPath = path.join(runDir, 'benchmark.json');
-  await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
+  const summaryPath = path.join(runDir, RUN_SUMMARY_FILENAME);
+  await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
 
   const targetSet = new Set(results.map((r) => r.target ?? 'unknown'));
-  return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size };
+  return { summaryPath, testCount: results.length, targetCount: targetSet.size };
 }
 
-async function readPlannedTestCount(benchmarkPath: string): Promise<number | undefined> {
+async function readPlannedTestCount(summaryPath: string): Promise<number | undefined> {
   try {
-    const raw = await readFile(benchmarkPath, 'utf8');
+    const raw = await readFile(summaryPath, 'utf8');
     const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
     const value = parsed.metadata?.planned_test_count;
     return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
@@ -234,7 +232,7 @@ export interface TimingArtifact {
   };
 }
 
-export interface BenchmarkArtifact {
+export interface RunSummaryArtifact {
   readonly metadata: {
     readonly eval_file: string;
     readonly timestamp: string;
@@ -255,6 +253,7 @@ export interface BenchmarkArtifact {
     }
   >;
   readonly per_grader_summary?: Record<string, { readonly mean: number; readonly stddev: number }>;
+  readonly timing: TimingArtifact;
   readonly notes: readonly string[];
 }
 
@@ -298,7 +297,6 @@ export interface IndexArtifactEntry {
   readonly artifact_dir?: string;
   readonly grading_path?: string;
   readonly timing_path?: string;
-  readonly benchmark_path?: string;
   readonly summary_path?: string;
   readonly output_path?: string;
   readonly answer_path?: string;
@@ -545,6 +543,10 @@ function toTrialArtifacts(
   }));
 }
 
+function toIndexTrialArtifacts(result: EvaluationResult): readonly TrialResultArtifact[] {
+  return toTrialArtifacts(result.trials) ?? toTrialArtifacts([singleRunTrial(result)]) ?? [];
+}
+
 function toTrialAggregationArtifact(
   aggregation: TrialAggregation | undefined,
 ): TrialAggregationArtifact | undefined {
@@ -682,8 +684,13 @@ function buildRepeatCaseSummaryArtifact(
   fingerprint?: string,
 ): RepeatCaseSummaryArtifact {
   const trials = result.trials ?? [];
-  const totalRuns = trials.length;
-  const passedRuns = trials.filter((trial) => trial.verdict === 'pass').length;
+  const totalRuns = trials.length > 0 ? trials.length : 1;
+  const passedRuns =
+    trials.length > 0
+      ? trials.filter((trial) => trial.verdict === 'pass').length
+      : result.executionStatus !== 'execution_error' && result.score >= DEFAULT_THRESHOLD
+        ? 1
+        : 0;
   const fallbackMeanMs = totalRuns > 0 ? roundMillis(timing.duration_ms / totalRuns) : 0;
   const meanDurationMs = timing.mean_duration_ms ?? fallbackMeanMs;
 
@@ -754,6 +761,29 @@ function buildVercelRunResultArtifact(params: {
   }) as unknown as VercelRunResultArtifact;
 }
 
+function singleRunTrial(result: EvaluationResult): TrialResult {
+  return {
+    attempt: 0,
+    score: result.score,
+    verdict:
+      result.executionStatus !== 'execution_error' && result.score >= DEFAULT_THRESHOLD
+        ? 'pass'
+        : 'fail',
+    scores: result.scores,
+    error: result.error,
+    costUsd: result.costUsd,
+    executionStatus: result.executionStatus,
+    failureStage: result.failureStage,
+    failureReasonCode: result.failureReasonCode,
+    result,
+  };
+}
+
+function materializedRunTrials(result: EvaluationResult): readonly TrialResult[] {
+  const persisted = (result.trials ?? []).filter((trial) => trial.result !== undefined);
+  return persisted.length > 0 ? persisted : [singleRunTrial(result)];
+}
+
 async function writeTrialRunArtifacts(params: {
   readonly trial: TrialResult;
   readonly parentTestDir: string;
@@ -771,9 +801,11 @@ async function writeTrialRunArtifacts(params: {
 
   const runDirName = trialRunDirName(params.trial.attempt);
   const runDir = path.join(params.parentTestDir, runDirName);
-  const grading = buildGradingArtifact(result);
+  const grading = buildGradingArtifact(result, { includeTrials: false });
   const timing = buildTimingArtifact([result]);
   const gradingPath = path.join(runDir, 'grading.json');
+  const timingPath = path.join(runDir, 'timing.json');
+  const metricsPath = path.join(runDir, CANONICAL_METRICS_ARTIFACT_PATH);
   const outputsDir = path.join(runDir, 'outputs');
   const answerOutputPath =
     result.output.length > 0 ? path.join(outputsDir, 'answer.md') : undefined;
@@ -795,11 +827,16 @@ async function writeTrialRunArtifacts(params: {
 
   await mkdir(runDir, { recursive: true });
   await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}\n`, 'utf8');
+  await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
 
   await mkdir(outputsDir, { recursive: true });
   if (answerOutputPath) {
     await writeFile(answerOutputPath, result.output, 'utf8');
   }
+  const rawProviderLogSource = rawProviderLogSourcePath(result);
+  if (rawProviderLogSource) {
+    await copyRawProviderLogArtifact(rawProviderLogSource, runDir);
+  }
   if (transcriptPath && transcriptRawPath) {
     await writeFile(
       transcriptPath,
@@ -808,12 +845,14 @@ async function writeTrialRunArtifacts(params: {
     );
     await writeTranscriptJsonl(transcriptRawPath, result, envelope);
   }
-  const metricsArtifact = buildMetricsArtifactPayload({
+  const metricsArtifact = await writeMetricsArtifact({
+    filePath: metricsPath,
     result,
     envelope,
     traceArtifactPath: 'transcript.json',
     transcriptArtifactPath: transcriptRawPath ? 'transcript-raw.jsonl' : undefined,
-    timingArtifactPath: null,
+    gradingArtifactPath: 'grading.json',
+    timingArtifactPath: 'timing.json',
     timing,
   });
 
@@ -902,11 +941,15 @@ function buildExportMetadata(
   };
 }
 
-export function buildGradingArtifact(result: EvaluationResult): GradingArtifact {
+export function buildGradingArtifact(
+  result: EvaluationResult,
+  options?: { includeTrials?: boolean },
+): GradingArtifact {
   const assertions = buildAssertions(result);
   const passed = assertions.filter((e) => e.passed).length;
   const failed = assertions.filter((e) => !e.passed).length;
   const total = assertions.length;
+  const includeTrials = options?.includeTrials ?? true;
 
   return {
     assertions,
@@ -925,8 +968,8 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
           conversation_id: result.conversationId,
         }
       : undefined,
-    trials: toTrialArtifacts(result.trials),
-    aggregation: toTrialAggregationArtifact(result.aggregation),
+    trials: includeTrials ? toIndexTrialArtifacts(result) : undefined,
+    aggregation: includeTrials ? toTrialAggregationArtifact(result.aggregation) : undefined,
   };
 }
 
@@ -1059,13 +1102,13 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin
   };
 }
 
-export function buildBenchmarkArtifact(
+export function buildRunSummaryArtifact(
   results: readonly EvaluationResult[],
   evalFile = '',
   experiment?: string,
   plannedTestCount?: number,
   experimentMetadata?: ExperimentArtifactMetadata,
-): BenchmarkArtifact {
+): RunSummaryArtifact {
   const targetSet = new Set<string>();
   const testIdSet = new Set<string>();
   for (const result of results) {
@@ -1076,7 +1119,7 @@ export function buildBenchmarkArtifact(
   const targets = [...targetSet].sort();
   const testIds = [...testIdSet].sort();
 
-  const runSummary: BenchmarkArtifact['run_summary'] = {};
+  const runSummary: RunSummaryArtifact['run_summary'] = {};
   const notes: string[] = [];
 
   for (const target of targets) {
@@ -1160,11 +1203,12 @@ export function buildBenchmarkArtifact(
     },
     run_summary: runSummary,
     per_grader_summary: perEvaluatorSummary,
+    timing: buildTimingArtifact(results),
     notes,
   };
 }
 
-export async function writeInitialBenchmarkArtifact(
+export async function writeInitialRunSummaryArtifact(
   runDir: string,
   options: {
     evalFile: string;
@@ -1174,15 +1218,15 @@ export async function writeInitialBenchmarkArtifact(
   },
 ): Promise<void> {
   await mkdir(runDir, { recursive: true });
-  const stub = buildBenchmarkArtifact(
+  const stub = buildRunSummaryArtifact(
     [],
     options.evalFile,
     options.experiment,
     options.plannedTestCount,
     options.experimentMetadata,
   );
-  const benchmarkPath = path.join(runDir, 'benchmark.json');
-  await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
+  const summaryPath = path.join(runDir, RUN_SUMMARY_FILENAME);
+  await writeFile(summaryPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
 }
 
 export function buildAggregateGradingArtifact(
@@ -1243,56 +1287,6 @@ function buildArtifactSubdir(result: EvaluationResult): string {
   return path.posix.join(...segments);
 }
 
-function formatOutputMarkdown(output: readonly { role: string; content?: unknown }[]): string {
-  return output.map((msg) => `@[${msg.role}]:\n${String(msg.content ?? '')}`).join('\n\n');
-}
-
-function formatInputValue(input: unknown): string | null {
-  if (!input) return null;
-  if (typeof input === 'string') return input;
-  if (Array.isArray(input) && input.length > 0) {
-    return formatOutputMarkdown(input as { role: string; content?: unknown }[]);
-  }
-  return null;
-}
-
-function extractInput(result: EvaluationResult): string | null {
-  return formatInputValue((result as unknown as Record<string, unknown>).input);
-}
-
-function extractTracePrompt(result: EvaluationResult): string | null {
-  const trace = (result as unknown as { trace?: { messages?: unknown } }).trace;
-  const messages = Array.isArray(trace?.messages) ? trace.messages : [];
-  const promptMessages: { role: string; content?: unknown }[] = [];
-  for (const message of messages) {
-    if (!isRecord(message) || typeof message.role !== 'string') continue;
-    if (message.role === 'assistant') break;
-    if (message.role === 'system' || message.role === 'user' || message.role === 'developer') {
-      promptMessages.push({ role: message.role, content: message.content });
-    }
-  }
-  return promptMessages.length > 0 ? formatOutputMarkdown(promptMessages) : null;
-}
-
-function extractPrompt(result: EvaluationResult, sourceTest?: EvalTest): string | null {
-  const input = extractInput(result);
-  if (input) return input;
-  const traceInput = extractTracePrompt(result);
-  if (traceInput) return traceInput;
-  for (const trial of result.trials ?? []) {
-    if (!trial.result) continue;
-    const trialInput = extractInput(trial.result);
-    if (trialInput) return trialInput;
-    const trialTraceInput = extractTracePrompt(trial.result);
-    if (trialTraceInput) return trialTraceInput;
-  }
-  const sourceInput = sourceTest
-    ? formatInputValue((sourceTest as unknown as Record<string, unknown>).input)
-    : null;
-  if (sourceInput) return sourceInput;
-  return null;
-}
-
 function toRelativeArtifactPath(outputDir: string, filePath: string): string {
   return path.relative(outputDir, filePath).split(path.sep).join('/');
 }
@@ -1322,12 +1316,8 @@ function rawProviderLogSourcePath(result: EvaluationResult): string | undefined
   return sourcePath ? sourcePath : undefined;
 }
 
-function rawProviderLogArtifactPath(testDir: string): string {
-  return path.join(testDir, 'provider.log');
-}
-
 async function copyRawProviderLogArtifact(sourcePath: string, testDir: string): Promise<string> {
-  const destinationPath = rawProviderLogArtifactPath(testDir);
+  const destinationPath = path.join(testDir, 'provider.log');
   if (path.resolve(sourcePath) === path.resolve(destinationPath)) {
     return destinationPath;
   }
@@ -1451,7 +1441,6 @@ export function buildIndexArtifactEntry(
     artifactDir?: string;
     gradingPath?: string;
     timingPath?: string;
-    benchmarkPath?: string;
     summaryPath?: string;
     outputPath?: string;
     answerPath?: string;
@@ -1460,7 +1449,6 @@ export function buildIndexArtifactEntry(
     metricsPath?: string;
     artifactPointers?: ResultArtifactPointersWire;
     rawProviderLogPath?: string;
-    inputPath?: string;
     extraIndexFields?: AdditionalResultIndexFields;
     projectionIdentity?: ProjectionIdentity;
     duplicatePolicy?: ExportDuplicatePolicy;
@@ -1480,7 +1468,7 @@ export function buildIndexArtifactEntry(
     start_time: result.startTime,
     end_time: result.endTime,
     scores: toIndexScores(result.scores),
-    trials: toTrialArtifacts(result.trials),
+    trials: toIndexTrialArtifacts(result),
     aggregation: toTrialAggregationArtifact(result.aggregation),
     execution_status: result.executionStatus,
     error: result.error,
@@ -1496,9 +1484,6 @@ export function buildIndexArtifactEntry(
     timing_path: options.timingPath
       ? toRelativeArtifactPath(options.outputDir, options.timingPath)
       : undefined,
-    benchmark_path: options.benchmarkPath
-      ? toRelativeArtifactPath(options.outputDir, options.benchmarkPath)
-      : undefined,
     summary_path: options.summaryPath
       ? toRelativeArtifactPath(options.outputDir, options.summaryPath)
       : undefined,
@@ -1521,9 +1506,6 @@ export function buildIndexArtifactEntry(
       ? toRelativeArtifactPath(options.outputDir, options.rawProviderLogPath)
       : undefined,
     artifact_pointers: options.artifactPointers,
-    input_path: options.inputPath
-      ? toRelativeArtifactPath(options.outputDir, options.inputPath)
-      : undefined,
     ...options.extraIndexFields,
     external_trace: toIndexExternalTrace(result, options.projectionIdentity?.dimensions.runId),
     projection_identity: options.projectionIdentity
@@ -1544,10 +1526,10 @@ export function buildResultIndexArtifact(
   },
 ): ResultIndexArtifact {
   const artifactSubdir = buildArtifactSubdir(result);
-  const input = extractPrompt(result);
   const hasAnswer = result.output.length > 0;
   const hasTranscript = resultHasExecutionTraceTranscript(result);
-  const hasRawProviderLog = rawProviderLogSourcePath(result) !== undefined;
+  const isSingleRun = !hasPersistedTrialRuns(result);
+  const singleRunDir = path.posix.join(artifactSubdir, trialRunDirName(0));
 
   return {
     timestamp: result.timestamp,
@@ -1563,7 +1545,7 @@ export function buildResultIndexArtifact(
     start_time: result.startTime,
     end_time: result.endTime,
     scores: toIndexScores(result.scores),
-    trials: toTrialArtifacts(result.trials),
+    trials: toIndexTrialArtifacts(result),
     aggregation: toTrialAggregationArtifact(result.aggregation),
     execution_status: result.executionStatus,
     error: result.error,
@@ -1571,20 +1553,20 @@ export function buildResultIndexArtifact(
     failure_reason_code: result.failureReasonCode,
     workspace_path: result.workspacePath,
     artifact_dir: artifactSubdir,
-    task_dir: input ? path.posix.join(artifactSubdir, 'task') : undefined,
-    grading_path: path.posix.join(artifactSubdir, 'grading.json'),
-    timing_path: path.posix.join(artifactSubdir, 'timing.json'),
-    input_path: input ? path.posix.join(artifactSubdir, 'task', 'PROMPT.md') : undefined,
-    output_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
-    answer_path: hasAnswer ? path.posix.join(artifactSubdir, 'outputs', 'answer.md') : undefined,
-    trace_path: path.posix.join(artifactSubdir, CANONICAL_TRACE_ARTIFACT_PATH),
-    transcript_path: hasTranscript
-      ? path.posix.join(artifactSubdir, CANONICAL_TRANSCRIPT_ARTIFACT_PATH)
-      : undefined,
-    metrics_path: path.posix.join(artifactSubdir, CANONICAL_METRICS_ARTIFACT_PATH),
-    raw_provider_log_path: hasRawProviderLog
-      ? path.posix.join(artifactSubdir, 'provider.log')
+    summary_path: path.posix.join(artifactSubdir, RUN_SUMMARY_FILENAME),
+    grading_path: isSingleRun ? path.posix.join(singleRunDir, 'grading.json') : undefined,
+    timing_path: isSingleRun ? path.posix.join(singleRunDir, 'timing.json') : undefined,
+    metrics_path: isSingleRun
+      ? path.posix.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH)
       : undefined,
+    output_path:
+      isSingleRun && hasAnswer ? path.posix.join(singleRunDir, 'outputs', 'answer.md') : undefined,
+    answer_path:
+      isSingleRun && hasAnswer ? path.posix.join(singleRunDir, 'outputs', 'answer.md') : undefined,
+    transcript_path:
+      isSingleRun && hasTranscript
+        ? path.posix.join(singleRunDir, 'transcript-raw.jsonl')
+        : undefined,
     artifact_pointers: options?.artifactPointers,
     ...extraIndexFields,
     external_trace: toIndexExternalTrace(result, options?.projectionIdentity?.dimensions.runId),
@@ -1937,8 +1919,7 @@ export async function writeArtifacts(
   options?: { evalFile?: string; experiment?: string },
 ): Promise<{
   testArtifactDir: string;
-  timingPath: string;
-  benchmarkPath: string;
+  summaryPath: string;
   indexPath: string;
 }> {
   const content = await readFile(jsonlPath, 'utf8');
@@ -1983,39 +1964,10 @@ export async function writePerTestArtifacts(
   const indexRecords: ResultIndexArtifact[] = [];
 
   for (const result of results) {
-    const grading = buildGradingArtifact(result);
-    const timing = buildTimingArtifact([result]);
     const artifactSubdir = buildArtifactSubdir(result);
     const testDir = path.join(outputDir, artifactSubdir);
     await mkdir(testDir, { recursive: true });
-    await writeFile(
-      path.join(testDir, 'grading.json'),
-      `${JSON.stringify(grading, null, 2)}\n`,
-      'utf8',
-    );
-    await writeFile(
-      path.join(testDir, 'timing.json'),
-      `${JSON.stringify(timing, null, 2)}\n`,
-      'utf8',
-    );
-
-    const input = extractPrompt(result, testByTestId.get(result.testId ?? ''));
-    if (input) {
-      const promptPath = path.join(testDir, 'task', 'PROMPT.md');
-      await mkdir(path.dirname(promptPath), { recursive: true });
-      await writeFile(promptPath, input, 'utf8');
-    }
-    const outputsDir = path.join(testDir, 'outputs');
-    await mkdir(outputsDir, { recursive: true });
-    if (result.output.length > 0) {
-      await writeFile(path.join(outputsDir, 'answer.md'), result.output, 'utf8');
-    }
-    const rawProviderLogSource = rawProviderLogSourcePath(result);
-    if (rawProviderLogSource) {
-      await copyRawProviderLogArtifact(rawProviderLogSource, testDir);
-    }
-    const tracePath = path.join(testDir, CANONICAL_TRACE_ARTIFACT_PATH);
-    const envelope = await writeTraceEnvelopeSidecar({
+    const envelope = buildTraceEnvelopeSidecar({
       result,
       outputDir,
       testDir,
@@ -2024,24 +1976,43 @@ export async function writePerTestArtifacts(
       runId: options?.runId,
       duplicatePolicy,
     });
-    const transcriptPath = hasTranscriptProjection(result, envelope)
-      ? path.join(testDir, CANONICAL_TRANSCRIPT_ARTIFACT_PATH)
-      : undefined;
-    if (transcriptPath) {
-      await writeTranscriptJsonl(transcriptPath, result, envelope);
+    const projectionIdentity = envelope.projectionIdentity;
+    if (!projectionIdentity) {
+      throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`);
     }
-    const metricsPath = path.join(testDir, CANONICAL_METRICS_ARTIFACT_PATH);
-    await writeMetricsArtifact({
-      filePath: metricsPath,
-      result,
-      envelope,
-      transcriptPath,
-    });
-    const artifactPointers = await buildArtifactPointers({
-      outputDir,
-      tracePath,
-      transcriptPath,
-    });
+    const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME);
+    const aggregateTiming = buildRepeatAggregateTimingArtifact(result);
+    const summary = buildRepeatCaseSummaryArtifact(result, aggregateTiming, projectionIdentity.id);
+    await writeFile(caseSummaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
+
+    for (const trial of materializedRunTrials(result)) {
+      await writeTrialRunArtifacts({
+        trial,
+        parentTestDir: testDir,
+        outputDir,
+        evalFile: options?.evalFile,
+        experiment: options?.experiment,
+        runId: options?.runId,
+        duplicatePolicy,
+        testByTestId,
+      });
+    }
+
+    const isSingleRun = !hasPersistedTrialRuns(result);
+    const singleRunDir = path.join(testDir, trialRunDirName(0));
+    const singleAnswerPath =
+      isSingleRun && result.output.length > 0
+        ? path.join(singleRunDir, 'outputs', 'answer.md')
+        : undefined;
+    const singleTranscriptPath =
+      isSingleRun && hasTranscriptProjection(result, envelope)
+        ? path.join(singleRunDir, 'transcript-raw.jsonl')
+        : undefined;
+    const singleGradingPath = isSingleRun ? path.join(singleRunDir, 'grading.json') : undefined;
+    const singleTimingPath = isSingleRun ? path.join(singleRunDir, 'timing.json') : undefined;
+    const singleMetricsPath = isSingleRun
+      ? path.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH)
+      : undefined;
 
     const extraIndexFields = await collectAdditionalIndexFields(
       result,
@@ -2052,10 +2023,19 @@ export async function writePerTestArtifacts(
     );
 
     indexRecords.push({
-      ...buildResultIndexArtifact(result, extraIndexFields, {
-        projectionIdentity: envelope.projectionIdentity,
+      ...buildIndexArtifactEntry(result, {
+        outputDir,
+        artifactDir: testDir,
+        summaryPath: caseSummaryPath,
+        gradingPath: singleGradingPath,
+        timingPath: singleTimingPath,
+        metricsPath: singleMetricsPath,
+        outputPath: singleAnswerPath,
+        answerPath: singleAnswerPath,
+        transcriptPath: singleTranscriptPath,
+        extraIndexFields,
+        projectionIdentity,
         duplicatePolicy,
-        artifactPointers,
       }),
       experiment: options?.experiment,
     });
@@ -2079,13 +2059,11 @@ export async function writeArtifactsFromResults(
   },
 ): Promise<{
   testArtifactDir: string;
-  timingPath: string;
-  benchmarkPath: string;
+  summaryPath: string;
   indexPath: string;
 }> {
   const testArtifactDir = outputDir;
-  const timingPath = path.join(outputDir, 'timing.json');
-  const benchmarkPath = path.join(outputDir, 'benchmark.json');
+  const summaryPath = path.join(outputDir, RUN_SUMMARY_FILENAME);
   const indexPath = path.join(outputDir, RESULT_INDEX_FILENAME);
   await mkdir(outputDir, { recursive: true });
   const duplicatePolicy = options?.duplicatePolicy ?? 'update';
@@ -2096,16 +2074,9 @@ export async function writeArtifactsFromResults(
   const emittedIdentityIds = new Set<string>();
 
   const plans = results.map((result) => {
-    const grading = buildGradingArtifact(result);
-    const timing = buildTimingArtifact([result]);
     const artifactSubdir = buildArtifactSubdir(result);
     const testDir = path.join(outputDir, artifactSubdir);
-    const gradingPath = path.join(testDir, 'grading.json');
-    const perTestTimingPath = path.join(testDir, 'timing.json');
-    const input = extractPrompt(result, testByTestId.get(result.testId ?? ''));
-    const inputPath = input ? path.join(testDir, 'task', 'PROMPT.md') : undefined;
-    const outputsDir = path.join(testDir, 'outputs');
-    const answerPath = result.output.length > 0 ? path.join(outputsDir, 'answer.md') : undefined;
+    const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME);
     const envelope = buildTraceEnvelopeSidecar({
       result,
       outputDir,
@@ -2115,38 +2086,37 @@ export async function writeArtifactsFromResults(
       runId: options?.runId,
       duplicatePolicy,
     });
-    const transcriptPath = hasTranscriptProjection(result, envelope)
-      ? path.join(testDir, CANONICAL_TRANSCRIPT_ARTIFACT_PATH)
-      : undefined;
-    const tracePath = path.join(testDir, CANONICAL_TRACE_ARTIFACT_PATH);
-    const metricsPath = path.join(testDir, CANONICAL_METRICS_ARTIFACT_PATH);
-    const rawProviderLogSource = rawProviderLogSourcePath(result);
-    const rawProviderLogPath = rawProviderLogSource
-      ? rawProviderLogArtifactPath(testDir)
-      : undefined;
     const projectionIdentity = envelope.projectionIdentity;
     if (!projectionIdentity) {
       throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`);
     }
     const identityId = projectionIdentity.id;
+    const isSingleRun = !hasPersistedTrialRuns(result);
+    const singleRunDir = path.join(testDir, trialRunDirName(0));
+    const singleAnswerPath =
+      isSingleRun && result.output.length > 0
+        ? path.join(singleRunDir, 'outputs', 'answer.md')
+        : undefined;
+    const singleTranscriptPath =
+      isSingleRun && hasTranscriptProjection(result, envelope)
+        ? path.join(singleRunDir, 'transcript-raw.jsonl')
+        : undefined;
+    const singleGradingPath = isSingleRun ? path.join(singleRunDir, 'grading.json') : undefined;
+    const singleTimingPath = isSingleRun ? path.join(singleRunDir, 'timing.json') : undefined;
+    const singleMetricsPath = isSingleRun
+      ? path.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH)
+      : undefined;
     return {
       result,
-      grading,
-      timing,
       testDir,
-      gradingPath,
-      perTestTimingPath,
-      input,
-      inputPath,
-      outputsDir,
-      answerPath,
-      tracePath,
-      metricsPath,
-      envelope,
+      caseSummaryPath,
       projectionIdentity,
-      transcriptPath,
-      rawProviderLogSource,
-      rawProviderLogPath,
+      isSingleRun,
+      singleAnswerPath,
+      singleTranscriptPath,
+      singleGradingPath,
+      singleTimingPath,
+      singleMetricsPath,
       identityId,
     };
   });
@@ -2168,7 +2138,7 @@ export async function writeArtifactsFromResults(
   }
 
   for (const plan of plans) {
-    const { result, envelope, identityId } = plan;
+    const { result, identityId } = plan;
     const existing = existingByIdentity.get(identityId);
     if (duplicatePolicy === 'skip' && existing) {
       indexRecords.push(skippedExistingRecord(existing, plan.projectionIdentity, duplicatePolicy));
@@ -2181,92 +2151,14 @@ export async function writeArtifactsFromResults(
 
     await mkdir(plan.testDir, { recursive: true });
 
-    if (hasPersistedTrialRuns(result)) {
-      const aggregateTiming = buildRepeatAggregateTimingArtifact(result);
-      const summaryPath = path.join(plan.testDir, 'summary.json');
-      const summary = buildRepeatCaseSummaryArtifact(
-        result,
-        aggregateTiming,
-        options?.experimentMetadata?.fingerprint ?? plan.projectionIdentity.id,
-      );
-      await writeFile(plan.gradingPath, `${JSON.stringify(plan.grading, null, 2)}\n`, 'utf8');
-      await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
-      if (plan.inputPath && plan.input) {
-        await mkdir(path.dirname(plan.inputPath), { recursive: true });
-        await writeFile(plan.inputPath, plan.input, 'utf8');
-      }
-      for (const trial of result.trials ?? []) {
-        await writeTrialRunArtifacts({
-          trial,
-          parentTestDir: plan.testDir,
-          outputDir,
-          evalFile: options?.evalFile,
-          experiment: options?.experiment,
-          runId: options?.runId,
-          duplicatePolicy,
-          testByTestId,
-        });
-      }
-
-      const nextRecord = {
-        ...buildIndexArtifactEntry(result, {
-          outputDir,
-          artifactDir: plan.testDir,
-          gradingPath: plan.gradingPath,
-          summaryPath,
-          inputPath: plan.inputPath,
-          extraIndexFields: plan.inputPath
-            ? { task_dir: toRelativeArtifactPath(outputDir, path.dirname(plan.inputPath)) }
-            : undefined,
-          projectionIdentity: plan.projectionIdentity,
-          duplicatePolicy,
-        }),
-        experiment: options?.experiment,
-      };
-      if (duplicatePolicy === 'update' && emittedIdentityIds.has(identityId)) {
-        const existingIndex = indexRecords.findIndex(
-          (record) => projectionIdentityRecordKey(record) === identityId,
-        );
-        if (existingIndex >= 0) {
-          indexRecords[existingIndex] = nextRecord;
-        }
-      } else {
-        indexRecords.push(nextRecord);
-      }
-      emittedIdentityIds.add(identityId);
-      continue;
-    }
-
-    await writeFile(plan.gradingPath, `${JSON.stringify(plan.grading, null, 2)}\n`, 'utf8');
-    await writeFile(plan.perTestTimingPath, `${JSON.stringify(plan.timing, null, 2)}\n`, 'utf8');
-
-    if (plan.inputPath && plan.input) {
-      await mkdir(path.dirname(plan.inputPath), { recursive: true });
-      await writeFile(plan.inputPath, plan.input, 'utf8');
-    }
-
-    await mkdir(plan.outputsDir, { recursive: true });
-    if (plan.answerPath) {
-      await writeFile(plan.answerPath, result.output, 'utf8');
-    }
-    if (plan.rawProviderLogSource) {
-      await copyRawProviderLogArtifact(plan.rawProviderLogSource, plan.testDir);
-    }
-    await writeFile(
-      plan.tracePath,
-      `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`,
-      'utf8',
-    );
-    if (plan.transcriptPath) {
-      await writeTranscriptJsonl(plan.transcriptPath, result, envelope);
-    }
-    await writeMetricsArtifact({
-      filePath: plan.metricsPath,
+    const aggregateTiming = buildRepeatAggregateTimingArtifact(result);
+    const summary = buildRepeatCaseSummaryArtifact(
       result,
-      envelope,
-      transcriptPath: plan.transcriptPath,
-    });
-    for (const trial of result.trials ?? []) {
+      aggregateTiming,
+      options?.experimentMetadata?.fingerprint ?? plan.projectionIdentity.id,
+    );
+    await writeFile(plan.caseSummaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
+    for (const trial of materializedRunTrials(result)) {
       await writeTrialRunArtifacts({
         trial,
         parentTestDir: plan.testDir,
@@ -2278,11 +2170,6 @@ export async function writeArtifactsFromResults(
         testByTestId,
       });
     }
-    const artifactPointers = await buildArtifactPointers({
-      outputDir,
-      tracePath: plan.tracePath,
-      transcriptPath: plan.transcriptPath,
-    });
 
     const extraIndexFields = await collectAdditionalIndexFields(
       result,
@@ -2292,28 +2179,18 @@ export async function writeArtifactsFromResults(
       options?.additionalArtifacts,
     );
 
-    const indexExtraFields = {
-      ...(plan.inputPath
-        ? { task_dir: toRelativeArtifactPath(outputDir, path.dirname(plan.inputPath)) }
-        : {}),
-      ...extraIndexFields,
-    };
-
     const nextRecord = {
       ...buildIndexArtifactEntry(result, {
         outputDir,
         artifactDir: plan.testDir,
-        gradingPath: plan.gradingPath,
-        timingPath: plan.perTestTimingPath,
-        outputPath: plan.answerPath,
-        answerPath: plan.answerPath,
-        tracePath: plan.tracePath,
-        transcriptPath: plan.transcriptPath,
-        metricsPath: plan.metricsPath,
-        artifactPointers,
-        rawProviderLogPath: plan.rawProviderLogPath,
-        inputPath: plan.inputPath,
-        extraIndexFields: indexExtraFields,
+        summaryPath: plan.caseSummaryPath,
+        gradingPath: plan.singleGradingPath,
+        timingPath: plan.singleTimingPath,
+        metricsPath: plan.singleMetricsPath,
+        outputPath: plan.singleAnswerPath,
+        answerPath: plan.singleAnswerPath,
+        transcriptPath: plan.singleTranscriptPath,
+        extraIndexFields,
         projectionIdentity: plan.projectionIdentity,
         duplicatePolicy,
       }),
@@ -2332,20 +2209,17 @@ export async function writeArtifactsFromResults(
     emittedIdentityIds.add(identityId);
   }
 
-  const timing = buildTimingArtifact(results);
-  await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
-
-  const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath));
-  const benchmark = buildBenchmarkArtifact(
+  const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(summaryPath));
+  const summary = buildRunSummaryArtifact(
     results,
     options?.evalFile,
     options?.experiment,
     plannedTestCount,
     options?.experimentMetadata,
   );
-  await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
+  await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
 
   await writeJsonlFile(indexPath, indexRecords);
 
-  return { testArtifactDir, timingPath, benchmarkPath, indexPath };
+  return { testArtifactDir, summaryPath, indexPath };
 }
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index bd41ac4bf..47fef0767 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -61,9 +61,10 @@ export {
 } from './evaluation/evaluate.js';
 export {
   RESULT_INDEX_FILENAME,
+  RUN_SUMMARY_FILENAME,
   aggregateRunDir,
   buildAggregateGradingArtifact,
-  buildBenchmarkArtifact,
+  buildRunSummaryArtifact,
   buildGradingArtifact,
   buildIndexArtifactEntry,
   buildResultIndexArtifact,
@@ -73,16 +74,16 @@ export {
   parseJsonlResults,
   writeArtifacts,
   writeArtifactsFromResults,
-  writeInitialBenchmarkArtifact,
+  writeInitialRunSummaryArtifact,
   writePerTestArtifacts,
   type AdditionalResultArtifactsContext,
   type AdditionalResultArtifactsWriter,
   type AdditionalResultIndexFields,
   type AggregateGradingArtifact,
-  type BenchmarkArtifact,
   type GradingArtifact,
   type IndexArtifactEntry,
   type ResultIndexArtifact,
+  type RunSummaryArtifact,
   type TimingArtifact,
 } from './evaluation/run-artifacts.js';
 export type {
diff --git a/packages/core/src/runtime/exec.ts b/packages/core/src/runtime/exec.ts
index 75216dfc8..8099dd088 100644
--- a/packages/core/src/runtime/exec.ts
+++ b/packages/core/src/runtime/exec.ts
@@ -32,71 +32,12 @@ export async function execFileWithStdin(
     throw new Error('Executable argv must include at least one entry');
   }
 
-  // Use Bun.spawn if available, otherwise fall back to Node.js child_process
-  if (typeof Bun !== 'undefined') {
-    return execFileWithStdinBun(argv, stdinPayload, options);
-  }
+  // Use Node's child_process path even under Bun. Bun 1.3.3 can hang when a
+  // Bun.spawn child is a Node process reading stdin, which breaks code grader
+  // and workspace hook execution on CI's pinned Bun version.
   return execFileWithStdinNode(argv, stdinPayload, options);
 }
 
-/**
- * Bun implementation using Bun.spawn
- */
-async function execFileWithStdinBun(
-  argv: readonly string[],
-  stdinPayload: string,
-  options: ExecOptions,
-): Promise<{
-  readonly stdout: string;
-  readonly stderr: string;
-  readonly exitCode: number;
-}> {
-  const command = [...argv];
-  const encoder = new TextEncoder();
-  const proc = Bun.spawn(command, {
-    cwd: options.cwd,
-    stdin: encoder.encode(stdinPayload),
-    stdout: 'pipe',
-    stderr: 'pipe',
-    // Merge additional env vars with process.env
-    env: options.env ? { ...process.env, ...options.env } : process.env,
-  });
-
-  let timedOut = false;
-  const timeout =
-    options.timeoutMs !== undefined
-      ? setTimeout(() => {
-          timedOut = true;
-          proc.kill('SIGKILL');
-        }, options.timeoutMs)
-      : undefined;
-
-  try {
-    const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve('');
-    const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve('');
-
-    const [stdout, stderr, exitCode] = await Promise.all([
-      stdoutPromise,
-      stderrPromise,
-      proc.exited,
-    ]);
-
-    if (timedOut) {
-      throw new Error(`Process timed out after ${options.timeoutMs}ms`);
-    }
-
-    return {
-      stdout: stdout.replace(/\r\n/g, '\n'),
-      stderr: stderr.replace(/\r\n/g, '\n'),
-      exitCode,
-    };
-  } finally {
-    if (timeout !== undefined) {
-      clearTimeout(timeout);
-    }
-  }
-}
-
 /**
  * Node.js implementation using child_process.spawn
  */
diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts
index a69a4478d..b65502f20 100644
--- a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts
+++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts
@@ -132,8 +132,7 @@ describe('evaluate() — programmatic API extensions', () => {
         expect(result.artifacts).toBeDefined();
         expect(result.artifacts?.runDir).toBe(outputDir);
         expect(result.artifacts?.indexPath).toBe(path.join(outputDir, 'index.jsonl'));
-        expect(result.artifacts?.benchmarkPath).toBe(path.join(outputDir, 'benchmark.json'));
-        expect(result.artifacts?.timingPath).toBe(path.join(outputDir, 'timing.json'));
+        expect(result.artifacts?.summaryPath).toBe(path.join(outputDir, 'summary.json'));
 
         const indexContent = await readFile(path.join(outputDir, 'index.jsonl'), 'utf8');
         expect(indexContent).toContain('"test_id":"programmatic-artifacts"');
@@ -143,17 +142,27 @@ describe('evaluate() — programmatic API extensions', () => {
           .split('\n')
           .map((line) => JSON.parse(line) as { artifact_dir?: string });
 
-        const benchmark = JSON.parse(
-          await readFile(path.join(outputDir, 'benchmark.json'), 'utf8'),
-        ) as { metadata: { experiment?: string; tests_run: string[]; eval_file: string } };
-        expect(benchmark.metadata.experiment).toBe('sdk-test');
-        expect(benchmark.metadata.tests_run).toEqual(['programmatic-artifacts']);
-        expect(benchmark.metadata.eval_file).toBe('');
+        const summaryArtifact = JSON.parse(
+          await readFile(path.join(outputDir, 'summary.json'), 'utf8'),
+        ) as {
+          metadata: { experiment?: string; tests_run: string[]; eval_file: string };
+          timing: { duration_ms: number };
+        };
+        expect(summaryArtifact.metadata.experiment).toBe('sdk-test');
+        expect(summaryArtifact.metadata.tests_run).toEqual(['programmatic-artifacts']);
+        expect(summaryArtifact.metadata.eval_file).toBe('');
+        expect(summaryArtifact.timing.duration_ms).toBeGreaterThanOrEqual(0);
 
         expect(indexRow?.artifact_dir).toBe('__programmatic__.yaml/programmatic-artifacts');
         expect(
           existsSync(
-            path.join(outputDir, '__programmatic__.yaml', 'programmatic-artifacts', 'grading.json'),
+            path.join(
+              outputDir,
+              '__programmatic__.yaml',
+              'programmatic-artifacts',
+              'run-1',
+              'grading.json',
+            ),
           ),
         ).toBe(true);
         expect(
@@ -162,6 +171,7 @@ describe('evaluate() — programmatic API extensions', () => {
               outputDir,
               '__programmatic__.yaml',
               'programmatic-artifacts',
+              'run-1',
               'outputs',
               'answer.md',
             ),
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index bfb23d02f..05355d12c 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -723,7 +723,7 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
     expect(result.failureReasonCode).toBe('provider_error');
   });
 
-  it('copies and indexes raw provider logs from normal per-case evaluation artifacts', async () => {
+  it('copies raw provider logs from normal per-case evaluation artifacts', async () => {
     const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-'));
     const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl');
     writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8');
@@ -750,11 +750,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
     await writeArtifactsFromResults([result], outputDir);
 
     const artifactDir = path.join(outputDir, 'test-dataset', 'case-1');
-    const outputsDir = path.join(artifactDir, 'outputs');
-    expect(readFileSync(path.join(artifactDir, 'provider.log'), 'utf8')).toBe(
+    const runDir = path.join(artifactDir, 'run-1');
+    const outputsDir = path.join(runDir, 'outputs');
+    expect(readFileSync(path.join(runDir, 'provider.log'), 'utf8')).toBe(
       '{"event":"provider-native"}\n',
     );
-    expect(readdirSync(artifactDir)).toContain('transcript.jsonl');
+    expect(readdirSync(runDir)).toContain('transcript-raw.jsonl');
+    expect(readdirSync(runDir)).toContain('transcript.json');
     expect(readdirSync(outputsDir)).not.toContain('transcript.jsonl');
     expect(readdirSync(outputsDir)).not.toContain('transcript.json');
 
@@ -762,9 +764,9 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
       .trim()
       .split('\n')
       .map((line) => JSON.parse(line) as Record<string, unknown>);
-    expect(indexRows[0]?.raw_provider_log_path).toBe('test-dataset/case-1/provider.log');
-    expect(indexRows[0]?.trace_path).toBe('test-dataset/case-1/trace.json');
-    expect(indexRows[0]?.transcript_path).toBe('test-dataset/case-1/transcript.jsonl');
+    expect(indexRows[0]?.raw_provider_log_path).toBeUndefined();
+    expect(indexRows[0]?.trace_path).toBeUndefined();
+    expect(indexRows[0]?.transcript_path).toBe('test-dataset/case-1/run-1/transcript-raw.jsonl');
   });
 
   it('reports failed progress status for batch item errors', async () => {
diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts
index 2ccf92d4e..5e7ac501f 100644
--- a/packages/core/test/evaluation/results-repo.test.ts
+++ b/packages/core/test/evaluation/results-repo.test.ts
@@ -241,7 +241,7 @@ function writeRunArtifacts(runDir: string, experiment: string, timestamp: string
   mkdirSync(runDir, { recursive: true });
   writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"alpha"}\n');
   writeFileSync(
-    path.join(runDir, 'benchmark.json'),
+    path.join(runDir, 'summary.json'),
     JSON.stringify(
       {
         metadata: {
@@ -395,11 +395,26 @@ describe('listGitRuns', () => {
     rmSync(repoDir, { recursive: true, force: true });
   });
 
-  it('returns committed runs derived from benchmark.json blobs', async () => {
+  it('returns committed runs derived from index.jsonl manifests', async () => {
     const defaultRunDir = path.join(repoDir, 'runs', 'default', '2026-05-20T10-00-00-000Z');
     mkdirSync(defaultRunDir, { recursive: true });
     writeFileSync(
-      path.join(defaultRunDir, 'benchmark.json'),
+      path.join(defaultRunDir, 'index.jsonl'),
+      `${[
+        JSON.stringify({
+          test_id: 'alpha',
+          target: 'gpt-4o',
+          timestamp: '2026-05-20T10:00:00.000Z',
+        }),
+        JSON.stringify({
+          test_id: 'beta',
+          target: 'gpt-4o',
+          timestamp: '2026-05-20T10:00:00.000Z',
+        }),
+      ].join('\n')}\n`,
+    );
+    writeFileSync(
+      path.join(defaultRunDir, 'summary.json'),
       JSON.stringify(
         {
           metadata: {
@@ -421,7 +436,27 @@ describe('listGitRuns', () => {
     const experimentRunDir = path.join(repoDir, 'runs', 'with-skills', '2026-05-21T11-00-00-000Z');
     mkdirSync(experimentRunDir, { recursive: true });
     writeFileSync(
-      path.join(experimentRunDir, 'benchmark.json'),
+      path.join(experimentRunDir, 'index.jsonl'),
+      `${[
+        JSON.stringify({
+          test_id: 'alpha',
+          target: 'claude-sonnet',
+          timestamp: '2026-05-21T11:00:00.000Z',
+        }),
+        JSON.stringify({
+          test_id: 'beta',
+          target: 'gpt-4o',
+          timestamp: '2026-05-21T11:00:00.000Z',
+        }),
+        JSON.stringify({
+          test_id: 'gamma',
+          target: 'gpt-4o',
+          timestamp: '2026-05-21T11:00:00.000Z',
+        }),
+      ].join('\n')}\n`,
+    );
+    writeFileSync(
+      path.join(experimentRunDir, 'summary.json'),
       JSON.stringify(
         {
           metadata: {
@@ -459,7 +494,7 @@ describe('listGitRuns', () => {
       timestamp: '2026-05-21T11:00:00.000Z',
       display_name: 'remote friendly run',
       manifest_path: 'runs/with-skills/2026-05-21T11-00-00-000Z/index.jsonl',
-      benchmark_path: 'runs/with-skills/2026-05-21T11-00-00-000Z/benchmark.json',
+      summary_path: 'runs/with-skills/2026-05-21T11-00-00-000Z/summary.json',
       test_count: 3,
       pass_rate: 0.75,
       avg_score: 0,
@@ -494,7 +529,15 @@ describe('listGitRuns', () => {
     const runDir = path.join(repoDir, 'runs', 'default', '2026-05-20T10-00-00-000Z');
     mkdirSync(runDir, { recursive: true });
     writeFileSync(
-      path.join(runDir, 'benchmark.json'),
+      path.join(runDir, 'index.jsonl'),
+      `${JSON.stringify({
+        test_id: 'alpha',
+        target: 'gpt-4o',
+        timestamp: '2026-05-20T10:00:00.000Z',
+      })}\n`,
+    );
+    writeFileSync(
+      path.join(runDir, 'summary.json'),
       JSON.stringify(
         {
           metadata: {
@@ -543,7 +586,7 @@ describe('listGitRuns', () => {
     mkdirSync(path.join(runDir, 'attachments'), { recursive: true });
     writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"alpha"}\n');
     writeFileSync(
-      path.join(runDir, 'benchmark.json'),
+      path.join(runDir, 'summary.json'),
       JSON.stringify({
         metadata: {
           timestamp: '2026-05-22T10:00:00.000Z',
@@ -666,7 +709,7 @@ describe('results repo write path', () => {
     expect(normalized.remote).toBe('origin');
   });
 
-  it('normalizes URL-backed source storage branch config without requiring a local remote name', () => {
+  it('normalizes URL-backed source storage branch config to the existing origin remote', () => {
     const normalized = normalizeResultsConfig(
       {
         repo_url: 'https://github.com/example/source.git',
@@ -682,10 +725,36 @@ describe('results repo write path', () => {
     expect(normalized.repo_path).toBeUndefined();
     expect(normalized.path).toBe('/tmp/source-project');
     expect(normalized.branch).toBe(DEFAULT_RESULTS_BRANCH);
-    expect(normalized.remote).toBe('agentv-results');
+    expect(normalized.remote).toBe('origin');
     expect(normalized.auto_push).toBe(true);
   });
 
+  it('does not mutate existing checkout remotes while checking sync status', async () => {
+    const sourceRootDir = path.join(rootDir, 'status-source-remote');
+    const configuredResultsRootDir = path.join(rootDir, 'status-configured-results-remote');
+    mkdirSync(sourceRootDir, { recursive: true });
+    mkdirSync(configuredResultsRootDir, { recursive: true });
+    const { remoteDir: sourceRemoteDir } = initializeRemoteRepo(sourceRootDir);
+    const { remoteDir: configuredResultsRemoteDir } =
+      initializeRemoteRepo(configuredResultsRootDir);
+    const projectDir = path.join(rootDir, 'source-project-status-remote-immutable');
+    git(`git clone --quiet "${sourceRemoteDir}" "${projectDir}"`, rootDir);
+    git('git config user.email "test@example.com"', projectDir);
+    git('git config user.name "Test User"', projectDir);
+
+    const originalOrigin = git('git remote get-url origin', projectDir);
+
+    await getResultsRepoSyncStatus({
+      repo_url: `file://${configuredResultsRemoteDir}`,
+      path: projectDir,
+      branch: DEFAULT_RESULTS_BRANCH,
+      sync: { auto_push: true },
+    });
+
+    expect(git('git remote get-url origin', projectDir)).toBe(originalOrigin);
+    expect(git('git remote get-url agentv-results || true', projectDir)).toBe('');
+  });
+
   it('publishes current-repo results to an auto-created branch without switching source checkout', async () => {
     const projectDir = path.join(rootDir, 'source-project');
     mkdirSync(projectDir, { recursive: true });
@@ -721,7 +790,7 @@ describe('results repo write path', () => {
     expect(published).toBe(true);
     expect(git('git branch --show-current', projectDir)).toBe('main');
     const branchFiles = git(`git ls-tree -r --name-only ${DEFAULT_RESULTS_BRANCH}`, projectDir);
-    expect(branchFiles).toContain(`runs/current-repo/${runTimestamp}/benchmark.json`);
+    expect(branchFiles).toContain(`runs/current-repo/${runTimestamp}/summary.json`);
     expect(branchFiles).not.toContain('README.md');
     expect(branchFiles).not.toContain('UNRELATED.txt');
     expect(git('git status --short --branch', projectDir)).toContain('## main');
@@ -878,15 +947,17 @@ describe('results repo write path', () => {
     );
   }, 20000);
 
-  it('publishes URL-backed source results through a managed remote alias', async () => {
+  it('publishes URL-backed source results through the checkout origin without mutating remotes', async () => {
     const { remoteDir } = initializeRemoteRepo(rootDir);
     const projectDir = path.join(rootDir, 'source-project-url-path');
     mkdirSync(projectDir, { recursive: true });
     git('git init --initial-branch=main --quiet', projectDir);
     git('git config user.email "test@example.com"', projectDir);
     git('git config user.name "Test User"', projectDir);
+    git(`git remote add origin "file://${remoteDir}"`, projectDir);
     writeFileSync(path.join(projectDir, 'README.md'), '# source project\n');
     git('git add README.md && git commit --quiet -m "seed source"', projectDir);
+    const originalOrigin = git('git remote get-url origin', projectDir);
 
     const runTimestamp = '2026-06-22T04-00-00-000Z';
     const runDir = path.join(
@@ -912,13 +983,14 @@ describe('results repo write path', () => {
     });
 
     expect(published).toBe(true);
-    expect(git('git remote get-url agentv-results', projectDir)).toBe(`file://${remoteDir}`);
+    expect(git('git remote get-url origin', projectDir)).toBe(originalOrigin);
+    expect(git('git remote get-url agentv-results || true', projectDir)).toBe('');
     expect(git('git branch --show-current', projectDir)).toBe('main');
     const remoteFiles = git(
       `git --git-dir "${remoteDir}" ls-tree -r --name-only ${DEFAULT_RESULTS_BRANCH}`,
       rootDir,
     );
-    expect(remoteFiles).toContain(`runs/url-backed-source/${runTimestamp}/benchmark.json`);
+    expect(remoteFiles).toContain(`runs/url-backed-source/${runTimestamp}/summary.json`);
     expect(remoteFiles).not.toContain('README.md');
   }, 20000);
 
@@ -1252,7 +1324,7 @@ describe('results repo write path', () => {
     ).rejects.toThrow(/simulated interrupted push/);
     expect(git('git rev-list --count origin/main..main', cloneDir)).toBe('1');
     expect(git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir)).not.toContain(
-      `runs/retry/${runTimestamp}/benchmark.json`,
+      `runs/retry/${runTimestamp}/summary.json`,
     );
 
     rmSync(hookPath, { force: true });
@@ -1268,7 +1340,7 @@ describe('results repo write path', () => {
 
     expect(git('git rev-list --count origin/main..main', cloneDir)).toBe('0');
     expect(git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir)).toContain(
-      `runs/retry/${runTimestamp}/benchmark.json`,
+      `runs/retry/${runTimestamp}/summary.json`,
     );
     expect(git(`git --git-dir "${remoteDir}" log -1 --pretty=%B main`, rootDir)).toContain(
       `AgentV-Run: retry::${runTimestamp}`,
@@ -1320,8 +1392,8 @@ describe('results repo write path', () => {
       `git --git-dir "${remoteDir}" ls-tree -r --name-only ${storageBranch}`,
       rootDir,
     );
-    expect(remoteFiles).toContain(`runs/${fixture.localDestinationPath}/benchmark.json`);
-    expect(remoteFiles).toContain('runs/remote-only/2026-06-23T09-30-00-000Z/benchmark.json');
+    expect(remoteFiles).toContain(`runs/${fixture.localDestinationPath}/summary.json`);
+    expect(remoteFiles).toContain('runs/remote-only/2026-06-23T09-30-00-000Z/summary.json');
     // No backup ref was ever created in the merge path.
     expect(
       git(`git --git-dir "${remoteDir}" for-each-ref refs/heads/agentv/backups`, rootDir),
@@ -1418,7 +1490,7 @@ describe('results repo write path', () => {
       rootDir,
     );
     expect(remoteFiles).toContain('RACE.md');
-    expect(remoteFiles).toContain('runs/race-local/2026-06-24T10-00-00-000Z/benchmark.json');
+    expect(remoteFiles).toContain('runs/race-local/2026-06-24T10-00-00-000Z/summary.json');
     expect(
       git(`git --git-dir "${remoteDir}" for-each-ref refs/heads/agentv/backups`, rootDir),
     ).toBe('');
@@ -1488,11 +1560,11 @@ describe('results repo write path', () => {
     expect(pushed).toBe(true);
     expect(git('git branch --show-current', cloneDir)).toBe('main');
     expect(git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir)).not.toContain(
-      `runs/branch-storage/${runTimestamp}/benchmark.json`,
+      `runs/branch-storage/${runTimestamp}/summary.json`,
     );
     expect(
       git(`git --git-dir "${remoteDir}" ls-tree -r --name-only ${storageBranch}`, rootDir),
-    ).toContain(`runs/branch-storage/${runTimestamp}/benchmark.json`);
+    ).toContain(`runs/branch-storage/${runTimestamp}/summary.json`);
     expect(
       git(`git --git-dir "${remoteDir}" log -1 --pretty=%B ${storageBranch}`, rootDir),
     ).toContain(`AgentV-Run: branch-storage::${runTimestamp}`);
@@ -1525,7 +1597,7 @@ describe('results repo write path', () => {
       rootDir,
     );
     expect(resultTree).toContain(`runs/${destinationPath}/index.jsonl`);
-    expect(resultTree).toContain(`runs/${destinationPath}/benchmark.json`);
+    expect(resultTree).toContain(`runs/${destinationPath}/summary.json`);
     expect(resultTree).not.toContain(`runs/${destinationPath}/alpha/trace.json`);
     expect(resultTree).not.toContain(`runs/${destinationPath}/alpha/transcript.jsonl`);
 
@@ -1535,7 +1607,7 @@ describe('results repo write path', () => {
     );
     expect(artifactTree).toContain(`runs/${destinationPath}/alpha/trace.json`);
     expect(artifactTree).toContain(`runs/${destinationPath}/alpha/transcript.jsonl`);
-    expect(artifactTree).not.toContain(`runs/${destinationPath}/benchmark.json`);
+    expect(artifactTree).not.toContain(`runs/${destinationPath}/summary.json`);
     expect(artifactTree).not.toContain(`runs/${destinationPath}/index.jsonl`);
 
     const index = JSON.parse(
@@ -1626,7 +1698,7 @@ describe('results repo write path', () => {
       rootDir,
     );
     expect(resultTree).toContain(`runs/${destinationPath}/index.jsonl`);
-    expect(resultTree).toContain(`runs/${destinationPath}/benchmark.json`);
+    expect(resultTree).toContain(`runs/${destinationPath}/summary.json`);
     expect(resultTree).not.toContain(`runs/${destinationPath}/alpha/trace.json`);
     expect(resultTree).not.toContain(`runs/${destinationPath}/alpha/transcript.jsonl`);
     const artifactTree = git(
@@ -1669,7 +1741,7 @@ describe('results repo write path', () => {
     );
     expect(
       git(`git --git-dir "${remoteDir}" ls-tree -r --name-only agentv-results`, rootDir),
-    ).toContain('runs/missing-branch/2026-06-12T11-00-00-000Z/benchmark.json');
+    ).toContain('runs/missing-branch/2026-06-12T11-00-00-000Z/summary.json');
   }, 20000);
 
   it('syncResultsRepo refreshes refs without checking out the base branch', async () => {
@@ -1778,7 +1850,7 @@ describe('results repo write path', () => {
       commit_created: false,
       blocked: false,
       branch: storageBranch,
-      upstream: `agentv-results/${storageBranch}`,
+      upstream: `origin/${storageBranch}`,
     });
     expect(status.auto_merged_remote).toBeUndefined();
     expect(git(`git show ${storageBranch}:REMOTE_BRANCH.md`, cloneDir)).toBe(
@@ -1812,7 +1884,7 @@ describe('results repo write path', () => {
       blocked: false,
     });
     expect(git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir)).toContain(
-      `runs/metadata/${runTimestamp}/benchmark.json`,
+      `runs/metadata/${runTimestamp}/summary.json`,
     );
   }, 20000);
 
@@ -1863,7 +1935,7 @@ describe('results repo write path', () => {
     });
     expect(status.dirty_paths).toEqual([]);
     expect(git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir)).toContain(
-      `runs/safe-run/${runTimestamp}/benchmark.json`,
+      `runs/safe-run/${runTimestamp}/summary.json`,
     );
     expect(git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir)).not.toContain(
       'package.json',
@@ -1897,7 +1969,7 @@ describe('results repo write path', () => {
       blocked: false,
     });
     const remoteFiles = git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir);
-    expect(remoteFiles).toContain(`runs/staged-unrelated/${runTimestamp}/benchmark.json`);
+    expect(remoteFiles).toContain(`runs/staged-unrelated/${runTimestamp}/summary.json`);
     expect(remoteFiles).not.toContain('package.json');
     expect(git('git status --porcelain', cloneDir)).toContain('A  package.json');
   }, 20000);
@@ -1957,7 +2029,7 @@ describe('results repo write path', () => {
     });
     const remoteFiles = git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir);
     expect(remoteFiles).toContain('REMOTE.md');
-    expect(remoteFiles).toContain(`runs/pulled-then-pushed/${runTimestamp}/benchmark.json`);
+    expect(remoteFiles).toContain(`runs/pulled-then-pushed/${runTimestamp}/summary.json`);
     expect(remoteFiles).not.toContain('package.json');
     expect(readFileSync(path.join(cloneDir, 'package.json'), 'utf8')).toBe(
       '{"dependencies":{"agentv":"next"}}\n',
@@ -2002,8 +2074,8 @@ describe('results repo write path', () => {
       ),
     ).not.toThrow();
     const remoteFiles = git(`git --git-dir "${remoteDir}" ls-tree -r --name-only main`, rootDir);
-    expect(remoteFiles).toContain('runs/local-only/2026-05-25T10-00-00-000Z/benchmark.json');
-    expect(remoteFiles).toContain('runs/remote-only/2026-05-25T11-00-00-000Z/benchmark.json');
+    expect(remoteFiles).toContain('runs/local-only/2026-05-25T10-00-00-000Z/summary.json');
+    expect(remoteFiles).toContain('runs/remote-only/2026-05-25T11-00-00-000Z/summary.json');
     expect(
       git(`git --git-dir "${remoteDir}" for-each-ref refs/heads/agentv/backups`, rootDir),
     ).toBe('');
@@ -2430,7 +2502,7 @@ describe('results branch stable genesis', () => {
     expect(isAncestor(remoteDir, mainSha, DEFAULT_RESULTS_BRANCH)).toBe(false);
     expect(
       git(`git --git-dir "${remoteDir}" ls-tree -r --name-only ${DEFAULT_RESULTS_BRANCH}`, rootDir),
-    ).toContain('runs/expA/2026-06-19T10-00-00-000Z/benchmark.json');
+    ).toContain('runs/expA/2026-06-19T10-00-00-000Z/summary.json');
   }, 20000);
 
   it('mints a byte-identical genesis root regardless of wall-clock time', async () => {
@@ -2490,8 +2562,8 @@ describe('results branch stable genesis', () => {
       `git --git-dir "${remoteDir}" ls-tree -r --name-only ${DEFAULT_RESULTS_BRANCH}`,
       rootDir,
     );
-    expect(tree).toContain('runs/expA/2026-06-19T10-00-00-000Z/benchmark.json');
-    expect(tree).toContain('runs/expB/2026-06-19T11-00-00-000Z/benchmark.json');
+    expect(tree).toContain('runs/expA/2026-06-19T10-00-00-000Z/summary.json');
+    expect(tree).toContain('runs/expB/2026-06-19T11-00-00-000Z/summary.json');
   }, 30000);
 
   it('reconciles two independent first-inits onto a single shared genesis', async () => {
@@ -2532,8 +2604,8 @@ describe('results branch stable genesis', () => {
       `git --git-dir "${remoteDir}" ls-tree -r --name-only ${DEFAULT_RESULTS_BRANCH}`,
       rootDir,
     );
-    expect(tree).toContain('runs/expA/2026-06-19T10-00-00-000Z/benchmark.json');
-    expect(tree).toContain('runs/expB/2026-06-19T11-00-00-000Z/benchmark.json');
+    expect(tree).toContain('runs/expA/2026-06-19T10-00-00-000Z/summary.json');
+    expect(tree).toContain('runs/expB/2026-06-19T11-00-00-000Z/summary.json');
   }, 30000);
 });
 
diff --git a/skills-data/agentv-bench/SKILL.md b/skills-data/agentv-bench/SKILL.md
index 1efa305e4..44dff85e9 100644
--- a/skills-data/agentv-bench/SKILL.md
+++ b/skills-data/agentv-bench/SKILL.md
@@ -242,7 +242,7 @@ agentv pipeline bench <run-dir>
 agentv results validate <run-dir>
 ```
 
-`pipeline bench` reads LLM grader results from `llm_grader_results/<name>.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`.
+`pipeline bench` reads LLM grader results from `llm_grader_results/<name>.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `summary.json`.
 
 > **Diagnosing `pass_rate=0`:** If `pipeline bench` reports `pass_rate=0` across the board, do **not** assume the tests genuinely failed. First verify the grading pipeline ran correctly: check that `<test-id>/llm_grader_results/<name>.json` exists and is non-empty for each test. If these files are absent or empty, the grader subagents failed to produce output (most common cause: `agents/grader.md` was not embedded in the subagent prompts — see Phase 2). Treat `pass_rate=0` as a real signal only after confirming grader results exist.
 
@@ -251,7 +251,7 @@ agentv results validate <run-dir>
 All artifacts use established schemas — see `references/schemas.md` for the full definitions. Do not modify the structure. Key artifacts per run:
 - **grading.json**: per-test assertions with `{text, passed, evidence}`, plus summary
 - **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}`
-- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}`
+- **summary.json**: per-target aggregate `{pass_rate, time_seconds, tokens}`
 
 Write artifacts to `.agentv/artifacts/` or the iteration directory.
 
@@ -426,7 +426,7 @@ The `references/` directory has additional documentation:
 - `references/subagent-pipeline.md` — Detailed subagent-mode pipeline commands and output structure
 - `references/description-optimization.md` — Skill description optimization workflow
 - `references/environment-adaptation.md` — Provider-specific notes and CI/headless behavior
-- `references/schemas.md` — JSON schemas for all artifacts (grading.json, benchmark.json, etc.)
+- `references/schemas.md` — JSON schemas for all artifacts (grading.json, summary.json, etc.)
 - `references/migrating-from-skill-creator.md` — Guide for users coming from Anthropic's skill-creator
 
 ---
diff --git a/skills-data/agentv-bench/agents/analyzer.md b/skills-data/agentv-bench/agents/analyzer.md
index 9f32dab7d..287e7682d 100644
--- a/skills-data/agentv-bench/agents/analyzer.md
+++ b/skills-data/agentv-bench/agents/analyzer.md
@@ -133,7 +133,7 @@ If a section has no findings, include the header with "None found." underneath.
 
 When analyzing benchmark results across multiple runs (e.g., across iterations or targets), the analyzer surfaces patterns the aggregate stats would hide.
 
-**Additional input:** `benchmark-data-path` — path to benchmark.json with all run results.
+**Additional input:** `benchmark-data-path` — path to summary.json with all run results.
 
 ### Cross-Run Pattern Analysis
 
diff --git a/skills-data/agentv-bench/references/autoresearch.md b/skills-data/agentv-bench/references/autoresearch.md
index 037b3f469..58d6a76e7 100644
--- a/skills-data/agentv-bench/references/autoresearch.md
+++ b/skills-data/agentv-bench/references/autoresearch.md
@@ -132,7 +132,7 @@ Each cycle is a standard eval run. Autoresearch session metadata lives in `_auto
     index.jsonl
     grading.json
     timing.json
-    benchmark.json
+    summary.json
     report.html
   2026-04-15T10-35-00/             # cycle 2 — standard run artifacts
     ...
diff --git a/skills-data/agentv-bench/references/eval-yaml-spec.md b/skills-data/agentv-bench/references/eval-yaml-spec.md
index 70cce2d56..afca72cfb 100644
--- a/skills-data/agentv-bench/references/eval-yaml-spec.md
+++ b/skills-data/agentv-bench/references/eval-yaml-spec.md
@@ -324,9 +324,9 @@ LLM grader results are read from disk at `<test-id>/llm_grader_results/<name>.js
 ```
 
 **Output:**
-- `<test-id>/grading.json` — merged grading with `graders`, `assertions`, `summary.pass_rate`
+- `<test-id>/run-1/grading.json` — merged grading with `graders`, `assertions`, `summary.pass_rate`
 - `index.jsonl` — one JSON line per test: `{test_id, score, pass, graders: [...]}`
-- `benchmark.json` — aggregate stats: `{metadata: {targets}, run_summary: {<target>: {mean, stddev, n}}}`
+- `summary.json` — aggregate stats: `{metadata: {targets}, run_summary: {<target>: {mean, stddev, n}}}`
 
 ### Agent-Mode Workflow
 
diff --git a/skills-data/agentv-bench/references/migrating-from-skill-creator.md b/skills-data/agentv-bench/references/migrating-from-skill-creator.md
index 8d4e841a4..71d2103db 100644
--- a/skills-data/agentv-bench/references/migrating-from-skill-creator.md
+++ b/skills-data/agentv-bench/references/migrating-from-skill-creator.md
@@ -46,13 +46,13 @@ AgentV's companion artifacts are compatible with skill-creator's eval-viewer:
 | Artifact | Format | Compatible with eval-viewer |
 |----------|--------|---------------------------|
 | `<test-id>/grading.json` | Per-assertion evidence with claims | ✅ Superset of skill-creator's per-test grading format |
-| `benchmark.json` | Aggregate pass rates, timing, patterns | ✅ Superset of Agent Skills benchmark format |
+| `summary.json` | Aggregate pass rates, timing, patterns | ✅ Superset of Agent Skills benchmark format |
 | Results JSONL | Per-test results | ✅ Standard JSONL format |
 
 AgentV's schemas are supersets — they include all fields skill-creator expects, plus additional fields (claims extraction, pattern analysis, deterministic upgrade candidates). Tools that read skill-creator artifacts will read AgentV artifacts correctly, ignoring the extra fields.
 
 The optimizer scripts layer reads those same artifacts directly:
-- `aggregate-benchmark.ts` consumes `benchmark.json`, `timing.json`, and results JSONL
+- `aggregate-benchmark.ts` consumes `summary.json`, `timing.json`, and results JSONL
 - `generate-report.ts` and `eval-viewer/generate-review.ts` render review output from AgentV artifacts
 - `improve-description.ts` proposes follow-up experiments from benchmark/grading observations
 
diff --git a/skills-data/agentv-bench/references/schemas.md b/skills-data/agentv-bench/references/schemas.md
index a2e430887..c91e87805 100644
--- a/skills-data/agentv-bench/references/schemas.md
+++ b/skills-data/agentv-bench/references/schemas.md
@@ -85,7 +85,7 @@ Tracks version progression in Improve mode. Located at workspace root.
 
 ## grading.json
 
-Output from the grader agent. Located at `<run-dir>/grading.json`.
+Output from the grader agent. Located at `<case-dir>/run-N/grading.json`.
 
 **Important:** The `assertions` array must use the fields `text`, `passed`, and `evidence` — downstream tooling depends on these exact field names.
 
@@ -164,7 +164,7 @@ Output from the grader agent. Located at `<run-dir>/grading.json`.
 
 ## metrics.json
 
-Output from the executor agent. Located at `<run-dir>/metrics.json`.
+Output from the executor agent. Located at `<case-dir>/run-N/metrics.json`.
 
 ```json
 {
@@ -198,7 +198,7 @@ Output from the executor agent. Located at `<run-dir>/metrics.json`.
 
 ## timing.json
 
-Wall clock timing for a run. Located at `<run-dir>/timing.json`.
+Wall clock timing for a run attempt. Located at `<case-dir>/run-N/timing.json`.
 
 **How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact.
 
@@ -218,9 +218,9 @@ Wall clock timing for a run. Located at `<run-dir>/timing.json`.
 
 ---
 
-## benchmark.json
+## summary.json
 
-Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
+Output from Benchmark mode. Located at the run root: `<run-dir>/summary.json`.
 
 ```json
 {
@@ -304,7 +304,7 @@ Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
   - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"`
 - `notes`: Freeform observations from the analyzer
 
-**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually.
+**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating summary.json manually.
 
 ---
 
diff --git a/skills-data/agentv-bench/references/subagent-pipeline.md b/skills-data/agentv-bench/references/subagent-pipeline.md
index 91b77e637..6d1ba7510 100644
--- a/skills-data/agentv-bench/references/subagent-pipeline.md
+++ b/skills-data/agentv-bench/references/subagent-pipeline.md
@@ -165,7 +165,7 @@ the eval.yaml. The target is recorded in `manifest.json` — one run = one targe
 .agentv/results/<experiment>/<timestamp>/
 ├── manifest.json                    ← eval metadata, target, test_ids
 ├── index.jsonl                      ← per-test scores
-├── benchmark.json                   ← aggregate statistics
+├── summary.json                     ← aggregate statistics
 └── <evalset-name>/                  ← eval.yaml "name" field, or eval file basename if absent (same as CLI mode)
     └── <test-id>/                   ← test case id
         ├── input.json               ← test input text + messages