EntityProcess · christso · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.agents/conventions.md b/.agents/conventions.md
@@ -29,14 +29,29 @@ When spawning a subprocess with an explicit `cwd`, pass user-supplied `args` thr
 - Those heuristics miss bare relative paths such as `plugins/foo`, can corrupt flag-value pairs such as `--config=./x`, and duplicate behavior the subprocess already handles.
 - See `docs/solutions/best-practices/trust-subprocess-cwd-for-relative-path-resolution.md`.
 
+## Git Remote Ownership
+
+Treat an existing Git checkout's remote configuration as user-owned state.
+AgentV may read remotes, fetch from a configured remote name, and push results
+refs to that remote, but it must not run `git remote add` or
+`git remote set-url` in an existing checkout as a side effect of Dashboard
+status, results sync, eval publishing, or WIP checkpoint handling. This applies
+especially to `results.repo.path: .`, where the source checkout's existing
+`origin` is the authoritative remote.
+
+If AgentV needs a separate results checkout and the configured path is missing
+or empty, create it with `git clone` and the requested remote name. If the path
+already exists, use its current Git config as-is or fail with clear setup
+guidance; do not repair, rewrite, or synthesize remotes in place.
+
 ## Naming: Project vs Benchmark
 
 These terms are distinct and not interchangeable.
 
 - Project: the top-level container Dashboard organizes around, backed by a registered workspace directory with `.agentv/`, run artifacts, traces, and experiments. The registry lives in `~/.agentv/projects.yaml` and is modeled by `ProjectEntry` and `ProjectRegistry` in `packages/core/src/projects.ts`.
 - Benchmark: a curated eval suite designed to measure something specific, in the academic ML sense. Example directories using this meaning are correctly named and should not be renamed.
 
-The legacy `~/.agentv/benchmarks.yaml` file is auto-migrated to `projects.yaml` by `migrateLegacyBenchmarksFile()`. The unrelated per-run `benchmark.json` artifact is a third, separate concept and should keep that name.
+The legacy `~/.agentv/benchmarks.yaml` file is auto-migrated to `projects.yaml` by `migrateLegacyBenchmarksFile()`. Run-level results metadata lives in `summary.json`, with `index.jsonl` as the discovery anchor.
 
 Rule of thumb:
 

diff --git a/STRATEGY.md b/STRATEGY.md
@@ -21,7 +21,7 @@ AgentV stays repo-native and workspace-native: it runs or imports evaluations ar
 
 - **Repo-native eval success** - Share of dogfood and example eval flows that run against real workspaces, hooks, repo materialization, or imported artifacts without extra infrastructure; measured by CI and manual UAT on canonical suites.
 - **Time to inspect a run** - Time from completed `agentv eval` to usable local review, compare, or report output from the canonical run bundle; measured through CLI and Dashboard/report workflows.
-- **Artifact portability coverage** - Share of integrations and follow-on workflows that consume `index.jsonl`, `benchmark.json`, trace sidecars, or imported run bundles instead of bespoke stores; measured by adapter smoke tests, docs, and example coverage.
+- **Artifact portability coverage** - Share of integrations and follow-on workflows that consume `index.jsonl`, `summary.json`, trace sidecars, or imported run bundles instead of bespoke stores; measured by adapter smoke tests, docs, and example coverage.
 - **Git-backed results reliability** - Success rate for publish, sync, resume, and WIP checkpoint flows across local branches and dedicated results repos; measured by integration tests and manual end-to-end verification.
 
 ## Tracks

diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -3,30 +3,31 @@ import path from 'node:path';
 import {
   type AdditionalResultArtifactsWriter,
   type AggregateGradingArtifact,
-  type BenchmarkArtifact,
   type EvalTest,
   type EvaluationResult,
   type ExperimentArtifactMetadata,
   type ExportDuplicatePolicy,
   type GradingArtifact,
   type IndexArtifactEntry,
   RESULT_INDEX_FILENAME,
+  RUN_SUMMARY_FILENAME,
   type ResultIndexArtifact,
+  type RunSummaryArtifact,
   type TimingArtifact,
   aggregateRunDir,
   buildAggregateGradingArtifact,
-  buildBenchmarkArtifact,
   buildIndexArtifactEntry as buildCoreIndexArtifactEntry,
   buildResultIndexArtifact as buildCoreResultIndexArtifact,
   buildGradingArtifact,
+  buildRunSummaryArtifact,
   buildTestTargetKey,
   buildTimingArtifact,
   deduplicateByTestIdTarget,
   parseJsonlResults,
   writeArtifacts,
   writeArtifactsFromResults as writeCoreArtifactsFromResults,
   writePerTestArtifacts as writeCorePerTestArtifacts,
-  writeInitialBenchmarkArtifact,
+  writeInitialRunSummaryArtifact,
 } from '@agentv/core';
 import type { TargetDefinition } from '@agentv/core';
 
@@ -39,22 +40,23 @@ import {
 export {
   aggregateRunDir,
   buildAggregateGradingArtifact,
-  buildBenchmarkArtifact,
+  buildRunSummaryArtifact,
   buildGradingArtifact,
   buildTestTargetKey,
   buildTimingArtifact,
   deduplicateByTestIdTarget,
   parseJsonlResults,
   RESULT_INDEX_FILENAME,
+  RUN_SUMMARY_FILENAME,
   writeArtifacts,
-  writeInitialBenchmarkArtifact,
+  writeInitialRunSummaryArtifact,
 };
 export type {
   AggregateGradingArtifact,
-  BenchmarkArtifact,
   GradingArtifact,
   IndexArtifactEntry,
   ResultIndexArtifact,
+  RunSummaryArtifact,
   TimingArtifact,
 };
 
@@ -90,15 +92,15 @@ export function buildIndexArtifactEntry(
   options: {
     outputDir: string;
     artifactDir?: string;
-    gradingPath: string;
-    timingPath: string;
+    gradingPath?: string;
+    timingPath?: string;
+    summaryPath?: string;
     outputPath?: string;
     answerPath?: string;
     tracePath?: string;
     transcriptPath?: string;
     metricsPath?: string;
     rawProviderLogPath?: string;
-    inputPath?: string;
     responsePath?: string;
     taskBundle?: MaterializedTaskBundlePaths;
   },
@@ -240,8 +242,7 @@ export async function writeArtifactsFromResults(
   },
 ): Promise<{
   testArtifactDir: string;
-  timingPath: string;
-  benchmarkPath: string;
+  summaryPath: string;
   indexPath: string;
 }> {
   return writeCoreArtifactsFromResults(results, outputDir, {

diff --git a/apps/cli/src/commands/eval/commands/aggregate.ts b/apps/cli/src/commands/eval/commands/aggregate.ts
@@ -6,7 +6,7 @@ import { aggregateRunDir } from '../artifact-writer.js';
 export const evalAggregateCommand = command({
   name: 'aggregate',
   description:
-    'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.',
+    'Recompute summary.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.',
   args: {
     runDir: positional({
       type: string,
@@ -16,9 +16,8 @@ export const evalAggregateCommand = command({
   },
   handler: async (args) => {
     const runDir = path.resolve(args.runDir);
-    const { benchmarkPath, timingPath, testCount, targetCount } = await aggregateRunDir(runDir);
+    const { summaryPath, testCount, targetCount } = await aggregateRunDir(runDir);
     console.log(`Aggregated ${testCount} test result(s) across ${targetCount} target(s)`);
-    console.log(`  Benchmark: ${benchmarkPath}`);
-    console.log(`  Timing:    ${timingPath}`);
+    console.log(`  Summary: ${summaryPath}`);
   },
 });
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -52,7 +52,7 @@ export const evalRunCommand = command({
       long: 'output',
       short: 'o',
       description:
-        'Run artifact directory (writes index.jsonl, benchmark.json, timing, and per-test artifacts)',
+        'Run artifact directory (writes index.jsonl, summary.json, and per-case artifacts)',
     }),
     outputFormat: option({
       type: optional(string),

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -60,7 +60,7 @@ import {
   deduplicateByTestIdTarget,
   parseJsonlResults,
   writeArtifactsFromResults,
-  writeInitialBenchmarkArtifact,
+  writeInitialRunSummaryArtifact,
 } from './artifact-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { resolveOtelBackend } from './otel-backends.js';
@@ -1996,15 +1996,15 @@ export async function runEvalCommand(
     );
   }
 
-  // Write a stub benchmark.json before dispatching tests, carrying the planned
+  // Write a stub summary.json before dispatching tests, carrying the planned
   // execution count so an interrupted run can still surface as resumable in
   // Dashboard (results.length < planned_test_count) even when every recorded row
   // has execution_status: ok. The end-of-run write preserves this value via
   // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
   // Skip on resume — we want to preserve the *original* planned count.
   if (!isResumeAppend && totalEvalCount > 0) {
     const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
-    await writeInitialBenchmarkArtifact(runDir, {
+    await writeInitialRunSummaryArtifact(runDir, {
       evalFile,
       plannedTestCount: totalEvalCount,
       experiment: normalizeExperimentName(options.experiment),
@@ -2262,42 +2262,36 @@ export async function runEvalCommand(
           sourceTests,
           taskBundleTargets,
         });
-        const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir(
+        const { summaryPath } = await aggregateRunDir(runDir, {
+          evalFile,
+          experiment: normalizeExperimentName(options.experiment),
+          experimentMetadata: options.experimentMetadata,
+        });
+        const indexPath = path.join(runDir, 'index.jsonl');
+        console.log(`Artifact workspace updated: ${runDir}`);
+        console.log(`  Index: ${indexPath}`);
+        console.log(`  Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
+        console.log(`  Summary: ${summaryPath}`);
+      } else {
+        const { testArtifactDir, summaryPath, indexPath } = await writeArtifactsFromResults(
+          allResults,
           runDir,
           {
             evalFile,
             experiment: normalizeExperimentName(options.experiment),
             experimentMetadata: options.experimentMetadata,
+            cwd,
+            repoRoot,
+            sourceTests,
+            taskBundleTargets,
           },
         );
-        const indexPath = path.join(runDir, 'index.jsonl');
-        console.log(`Artifact workspace updated: ${runDir}`);
-        console.log(`  Index: ${indexPath}`);
-        console.log(`  Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
-        console.log(`  Timing: ${timingPath}`);
-        console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
-      } else {
-        const {
-          testArtifactDir,
-          timingPath,
-          benchmarkPath: workspaceBenchmarkPath,
-          indexPath,
-        } = await writeArtifactsFromResults(allResults, runDir, {
-          evalFile,
-          experiment: normalizeExperimentName(options.experiment),
-          experimentMetadata: options.experimentMetadata,
-          cwd,
-          repoRoot,
-          sourceTests,
-          taskBundleTargets,
-        });
         console.log(`Artifact workspace written to: ${runDir}`);
         console.log(`  Index: ${indexPath}`);
         console.log(
           `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
         );
-        console.log(`  Timing: ${timingPath}`);
-        console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
+        console.log(`  Summary: ${summaryPath}`);
       }
     }
 

diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts
@@ -577,10 +577,10 @@ function buildRunId(relativeRunPath: string): string {
 
 function readRunDisplayName(runDir: string): string | undefined {
   try {
-    const benchmark = JSON.parse(readFileSync(path.join(runDir, 'benchmark.json'), 'utf8')) as {
+    const summary = JSON.parse(readFileSync(path.join(runDir, 'summary.json'), 'utf8')) as {
       metadata?: { display_name?: unknown };
     };
-    const displayName = benchmark.metadata?.display_name;
+    const displayName = summary.metadata?.display_name;
     return typeof displayName === 'string' && displayName.trim() ? displayName.trim() : undefined;
   } catch {
     return undefined;

diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
@@ -7,7 +7,7 @@
  * Writes:
  *   - <test-id>/grading.json  (per-test grading breakdown)
  *   - index.jsonl             (one line per test)
- *   - benchmark.json          (aggregate statistics)
+ *   - summary.json            (aggregate statistics and a zeroed timing block)
  */
 import { existsSync } from 'node:fs';
 import { readFile, readdir, writeFile } from 'node:fs/promises';
@@ -199,9 +199,9 @@ export const evalBenchCommand = command({
       'utf8',
     );
 
-    // Write benchmark.json
+    // Write summary.json
     const passRateStats = computeStats(allPassRates);
-    const benchmark = {
+    const summary = {
       metadata: {
         eval_file: manifest.eval_file,
         timestamp: manifest.timestamp,
@@ -216,11 +216,24 @@ export const evalBenchCommand = command({
           tokens: { mean: 0, stddev: 0 },
         },
       },
+      timing: {
+        total_tokens: 0,
+        duration_ms: 0,
+        total_duration_seconds: 0,
+        cost_usd: null,
+        token_usage: { input: 0, output: 0, reasoning: 0 },
+        usage_sources: {
+          token_usage: 'unavailable',
+          total_tokens: 'unavailable',
+          duration: 'unavailable',
+          cost: 'unavailable',
+        },
+      },
       notes: [],
     };
     await writeFile(
-      join(exportDir, 'benchmark.json'),
-      `${JSON.stringify(benchmark, null, 2)}\n`,
+      join(exportDir, 'summary.json'),
+      `${JSON.stringify(summary, null, 2)}\n`,
       'utf8',
     );