diff --git a/.agents/conventions.md b/.agents/conventions.md index ca4da8dbf..6e1d0deae 100644 --- a/.agents/conventions.md +++ b/.agents/conventions.md @@ -29,6 +29,21 @@ When spawning a subprocess with an explicit `cwd`, pass user-supplied `args` thr - Those heuristics miss bare relative paths such as `plugins/foo`, can corrupt flag-value pairs such as `--config=./x`, and duplicate behavior the subprocess already handles. - See `docs/solutions/best-practices/trust-subprocess-cwd-for-relative-path-resolution.md`. +## Git Remote Ownership + +Treat an existing Git checkout's remote configuration as user-owned state. +AgentV may read remotes, fetch from a configured remote name, and push results +refs to that remote, but it must not run `git remote add` or `git remote +set-url` in an existing checkout as a side effect of Dashboard status, results +sync, eval publishing, or WIP checkpoint handling. This applies especially to +`results.repo.path: .`, where the source checkout's existing `origin` is the +authoritative remote. + +If AgentV needs a separate results checkout and the configured path is missing +or empty, create it with `git clone` and the requested remote name. If the path +already exists, use its current Git config as-is or fail with clear setup +guidance; do not repair, rewrite, or synthesize remotes in place. + ## Naming: Project vs Benchmark These terms are distinct and not interchangeable. @@ -36,7 +51,7 @@ These terms are distinct and not interchangeable. - Project: the top-level container Dashboard organizes around, backed by a registered workspace directory with `.agentv/`, run artifacts, traces, and experiments. The registry lives in `~/.agentv/projects.yaml` and is modeled by `ProjectEntry` and `ProjectRegistry` in `packages/core/src/projects.ts`. - Benchmark: a curated eval suite designed to measure something specific, in the academic ML sense. Example directories using this meaning are correctly named and should not be renamed. -The legacy `~/.agentv/benchmarks.yaml` file is auto-migrated to `projects.yaml` by `migrateLegacyBenchmarksFile()`. The unrelated per-run `benchmark.json` artifact is a third, separate concept and should keep that name. +The legacy `~/.agentv/benchmarks.yaml` file is auto-migrated to `projects.yaml` by `migrateLegacyBenchmarksFile()`. Run-level results metadata lives in `summary.json`, with `index.jsonl` as the discovery anchor. Rule of thumb: diff --git a/STRATEGY.md b/STRATEGY.md index a021b0101..5f52388d9 100644 --- a/STRATEGY.md +++ b/STRATEGY.md @@ -21,7 +21,7 @@ AgentV stays repo-native and workspace-native: it runs or imports evaluations ar - **Repo-native eval success** - Share of dogfood and example eval flows that run against real workspaces, hooks, repo materialization, or imported artifacts without extra infrastructure; measured by CI and manual UAT on canonical suites. - **Time to inspect a run** - Time from completed `agentv eval` to usable local review, compare, or report output from the canonical run bundle; measured through CLI and Dashboard/report workflows. -- **Artifact portability coverage** - Share of integrations and follow-on workflows that consume `index.jsonl`, `benchmark.json`, trace sidecars, or imported run bundles instead of bespoke stores; measured by adapter smoke tests, docs, and example coverage. +- **Artifact portability coverage** - Share of integrations and follow-on workflows that consume `index.jsonl`, `summary.json`, trace sidecars, or imported run bundles instead of bespoke stores; measured by adapter smoke tests, docs, and example coverage. - **Git-backed results reliability** - Success rate for publish, sync, resume, and WIP checkpoint flows across local branches and dedicated results repos; measured by integration tests and manual end-to-end verification. ## Tracks diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 09e2d098e..e1591675b 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -3,7 +3,6 @@ import path from 'node:path'; import { type AdditionalResultArtifactsWriter, type AggregateGradingArtifact, - type BenchmarkArtifact, type EvalTest, type EvaluationResult, type ExperimentArtifactMetadata, @@ -11,14 +10,16 @@ import { type GradingArtifact, type IndexArtifactEntry, RESULT_INDEX_FILENAME, + RUN_SUMMARY_FILENAME, type ResultIndexArtifact, + type RunSummaryArtifact, type TimingArtifact, aggregateRunDir, buildAggregateGradingArtifact, - buildBenchmarkArtifact, buildIndexArtifactEntry as buildCoreIndexArtifactEntry, buildResultIndexArtifact as buildCoreResultIndexArtifact, buildGradingArtifact, + buildRunSummaryArtifact, buildTestTargetKey, buildTimingArtifact, deduplicateByTestIdTarget, @@ -26,7 +27,7 @@ import { writeArtifacts, writeArtifactsFromResults as writeCoreArtifactsFromResults, writePerTestArtifacts as writeCorePerTestArtifacts, - writeInitialBenchmarkArtifact, + writeInitialRunSummaryArtifact, } from '@agentv/core'; import type { TargetDefinition } from '@agentv/core'; @@ -39,22 +40,23 @@ import { export { aggregateRunDir, buildAggregateGradingArtifact, - buildBenchmarkArtifact, + buildRunSummaryArtifact, buildGradingArtifact, buildTestTargetKey, buildTimingArtifact, deduplicateByTestIdTarget, parseJsonlResults, RESULT_INDEX_FILENAME, + RUN_SUMMARY_FILENAME, writeArtifacts, - writeInitialBenchmarkArtifact, + writeInitialRunSummaryArtifact, }; export type { AggregateGradingArtifact, - BenchmarkArtifact, GradingArtifact, IndexArtifactEntry, ResultIndexArtifact, + RunSummaryArtifact, TimingArtifact, }; @@ -90,15 +92,15 @@ export function buildIndexArtifactEntry( options: { outputDir: string; artifactDir?: string; - gradingPath: string; - timingPath: string; + gradingPath?: string; + timingPath?: string; + summaryPath?: string; outputPath?: string; answerPath?: string; tracePath?: string; transcriptPath?: string; metricsPath?: string; rawProviderLogPath?: string; - inputPath?: string; responsePath?: string; taskBundle?: MaterializedTaskBundlePaths; }, @@ -240,8 +242,7 @@ export async function writeArtifactsFromResults( }, ): Promise<{ testArtifactDir: string; - timingPath: string; - benchmarkPath: string; + summaryPath: string; indexPath: string; }> { return writeCoreArtifactsFromResults(results, outputDir, { diff --git a/apps/cli/src/commands/eval/commands/aggregate.ts b/apps/cli/src/commands/eval/commands/aggregate.ts index 7483b8412..275e792ab 100644 --- a/apps/cli/src/commands/eval/commands/aggregate.ts +++ b/apps/cli/src/commands/eval/commands/aggregate.ts @@ -6,7 +6,7 @@ import { aggregateRunDir } from '../artifact-writer.js'; export const evalAggregateCommand = command({ name: 'aggregate', description: - 'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.', + 'Recompute summary.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.', args: { runDir: positional({ type: string, @@ -16,9 +16,8 @@ export const evalAggregateCommand = command({ }, handler: async (args) => { const runDir = path.resolve(args.runDir); - const { benchmarkPath, timingPath, testCount, targetCount } = await aggregateRunDir(runDir); + const { summaryPath, testCount, targetCount } = await aggregateRunDir(runDir); console.log(`Aggregated ${testCount} test result(s) across ${targetCount} target(s)`); - console.log(` Benchmark: ${benchmarkPath}`); - console.log(` Timing: ${timingPath}`); + console.log(` Summary: ${summaryPath}`); }, }); diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index a498f4ccc..3b392ce7c 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -52,7 +52,7 @@ export const evalRunCommand = command({ long: 'output', short: 'o', description: - 'Run artifact directory (writes index.jsonl, benchmark.json, timing, and per-test artifacts)', + 'Run artifact directory (writes index.jsonl, summary.json, and per-case artifacts)', }), outputFormat: option({ type: optional(string), diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 5e2eb1058..014c0ab77 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -60,7 +60,7 @@ import { deduplicateByTestIdTarget, parseJsonlResults, writeArtifactsFromResults, - writeInitialBenchmarkArtifact, + writeInitialRunSummaryArtifact, } from './artifact-writer.js'; import { loadEnvFromHierarchy } from './env.js'; import { resolveOtelBackend } from './otel-backends.js'; @@ -1996,7 +1996,7 @@ export async function runEvalCommand( ); } - // Write a stub benchmark.json before dispatching tests, carrying the planned + // Write a stub summary.json before dispatching tests, carrying the planned // execution count so an interrupted run can still surface as resumable in // Dashboard (results.length < planned_test_count) even when every recorded row // has execution_status: ok. The end-of-run write preserves this value via @@ -2004,7 +2004,7 @@ export async function runEvalCommand( // Skip on resume — we want to preserve the *original* planned count. if (!isResumeAppend && totalEvalCount > 0) { const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : ''; - await writeInitialBenchmarkArtifact(runDir, { + await writeInitialRunSummaryArtifact(runDir, { evalFile, plannedTestCount: totalEvalCount, experiment: normalizeExperimentName(options.experiment), @@ -2262,42 +2262,36 @@ export async function runEvalCommand( sourceTests, taskBundleTargets, }); - const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir( + const { summaryPath } = await aggregateRunDir(runDir, { + evalFile, + experiment: normalizeExperimentName(options.experiment), + experimentMetadata: options.experimentMetadata, + }); + const indexPath = path.join(runDir, 'index.jsonl'); + console.log(`Artifact workspace updated: ${runDir}`); + console.log(` Index: ${indexPath}`); + console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`); + console.log(` Summary: ${summaryPath}`); + } else { + const { testArtifactDir, summaryPath, indexPath } = await writeArtifactsFromResults( + allResults, runDir, { evalFile, experiment: normalizeExperimentName(options.experiment), experimentMetadata: options.experimentMetadata, + cwd, + repoRoot, + sourceTests, + taskBundleTargets, }, ); - const indexPath = path.join(runDir, 'index.jsonl'); - console.log(`Artifact workspace updated: ${runDir}`); - console.log(` Index: ${indexPath}`); - console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`); - console.log(` Timing: ${timingPath}`); - console.log(` Benchmark: ${workspaceBenchmarkPath}`); - } else { - const { - testArtifactDir, - timingPath, - benchmarkPath: workspaceBenchmarkPath, - indexPath, - } = await writeArtifactsFromResults(allResults, runDir, { - evalFile, - experiment: normalizeExperimentName(options.experiment), - experimentMetadata: options.experimentMetadata, - cwd, - repoRoot, - sourceTests, - taskBundleTargets, - }); console.log(`Artifact workspace written to: ${runDir}`); console.log(` Index: ${indexPath}`); console.log( ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`, ); - console.log(` Timing: ${timingPath}`); - console.log(` Benchmark: ${workspaceBenchmarkPath}`); + console.log(` Summary: ${summaryPath}`); } } diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index 0df6e0c53..fa6101b5f 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -577,10 +577,10 @@ function buildRunId(relativeRunPath: string): string { function readRunDisplayName(runDir: string): string | undefined { try { - const benchmark = JSON.parse(readFileSync(path.join(runDir, 'benchmark.json'), 'utf8')) as { + const summary = JSON.parse(readFileSync(path.join(runDir, 'summary.json'), 'utf8')) as { metadata?: { display_name?: unknown }; }; - const displayName = benchmark.metadata?.display_name; + const displayName = summary.metadata?.display_name; return typeof displayName === 'string' && displayName.trim() ? displayName.trim() : undefined; } catch { return undefined; diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index f7466f0ed..fed6dce88 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -7,7 +7,7 @@ * Writes: * - /grading.json (per-test grading breakdown) * - index.jsonl (one line per test) - * - benchmark.json (aggregate statistics) + * - summary.json (aggregate statistics) */ import { existsSync } from 'node:fs'; import { readFile, readdir, writeFile } from 'node:fs/promises'; @@ -199,9 +199,9 @@ export const evalBenchCommand = command({ 'utf8', ); - // Write benchmark.json + // Write summary.json const passRateStats = computeStats(allPassRates); - const benchmark = { + const summary = { metadata: { eval_file: manifest.eval_file, timestamp: manifest.timestamp, @@ -216,11 +216,24 @@ export const evalBenchCommand = command({ tokens: { mean: 0, stddev: 0 }, }, }, + timing: { + total_tokens: 0, + duration_ms: 0, + total_duration_seconds: 0, + cost_usd: null, + token_usage: { input: 0, output: 0, reasoning: 0 }, + usage_sources: { + token_usage: 'unavailable', + total_tokens: 'unavailable', + duration: 'unavailable', + cost: 'unavailable', + }, + }, notes: [], }; await writeFile( - join(exportDir, 'benchmark.json'), - `${JSON.stringify(benchmark, null, 2)}\n`, + join(exportDir, 'summary.json'), + `${JSON.stringify(summary, null, 2)}\n`, 'utf8', ); diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts index 18d7bbb99..113fa2461 100644 --- a/apps/cli/src/commands/results/combine-run.ts +++ b/apps/cli/src/commands/results/combine-run.ts @@ -5,8 +5,8 @@ * Combines two or more local run workspace manifests into a new local run * workspace. The writer keeps per-test artifacts self-contained by copying * referenced source files under `sources/source-N/` and rewriting manifest - * paths, while recomputing top-level `timing.json` and `benchmark.json` from - * the selected result rows. + * paths, while recomputing top-level `summary.json` from the selected result + * rows. */ import { @@ -28,10 +28,9 @@ import type { } from '@agentv/core'; import { - type BenchmarkArtifact, - buildBenchmarkArtifact, + type RunSummaryArtifact, + buildRunSummaryArtifact, buildTestTargetKey, - buildTimingArtifact, } from '../eval/artifact-writer.js'; import { buildDefaultRunDirFromName, @@ -103,8 +102,7 @@ export interface CombineRunResult { readonly runDir: string; readonly runId: string; readonly manifestPath: string; - readonly benchmarkPath: string; - readonly timingPath: string; + readonly summaryPath: string; readonly displayName: string; readonly experiment: string; readonly combinedFromRunIds: readonly string[]; @@ -126,13 +124,13 @@ function readManifestRecords(manifestPath: string): ResultManifestRecord[] { .map(parseJsonlLine); } -function readBenchmarkMetadata(manifestPath: string): { +function readSummaryMetadata(manifestPath: string): { timestamp?: string; displayName?: string; } { try { - const benchmarkPath = path.join(path.dirname(manifestPath), 'benchmark.json'); - const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as { + const summaryPath = path.join(path.dirname(manifestPath), 'summary.json'); + const parsed = JSON.parse(readFileSync(summaryPath, 'utf8')) as { metadata?: { timestamp?: string; display_name?: string }; }; return { @@ -168,7 +166,7 @@ function loadSources(sources: readonly CombineRunSource[]): LoadedSource[] { if (records.length !== results.length) { throw new Error(`Manifest could not be hydrated completely: ${manifestPath}`); } - const metadata = readBenchmarkMetadata(manifestPath); + const metadata = readSummaryMetadata(manifestPath); return { ...source, index, @@ -363,7 +361,6 @@ function resolveCombinedExperiment( const MANIFEST_PATH_FIELDS = [ 'artifact_dir', - 'benchmark_path', 'summary_path', 'grading_path', 'timing_path', @@ -604,13 +601,9 @@ export function combineRunSources(options: CombineRunOptions): CombineRunResult const manifestPath = path.join(runDir, 'index.jsonl'); writeJsonl(manifestPath, records); - const timing = buildTimingArtifact(results); - const timingPath = path.join(runDir, 'timing.json'); - writeJson(timingPath, timing); - - const benchmark = buildBenchmarkArtifact(results, '', 'combined', results.length); - const benchmarkWithMetadata: BenchmarkArtifact & { - metadata: BenchmarkArtifact['metadata'] & { + const summary = buildRunSummaryArtifact(results, '', 'combined', results.length); + const summaryWithMetadata: RunSummaryArtifact & { + metadata: RunSummaryArtifact['metadata'] & { display_name: string; combined_from_run_ids: readonly string[]; combined_from_display_names: readonly string[]; @@ -618,10 +611,10 @@ export function combineRunSources(options: CombineRunOptions): CombineRunResult duplicate_policy: Exclude | 'prompt'; }; } = { - ...benchmark, + ...summary, metadata: { - ...benchmark.metadata, - timestamp: startedAt ?? benchmark.metadata.timestamp, + ...summary.metadata, + timestamp: startedAt ?? summary.metadata.timestamp, display_name: displayName, experiment, combined_from_run_ids: loadedSources.map((source) => source.id), @@ -629,16 +622,15 @@ export function combineRunSources(options: CombineRunOptions): CombineRunResult duplicate_policy: options.duplicatePolicy, }, }; - const benchmarkPath = path.join(runDir, 'benchmark.json'); - writeJson(benchmarkPath, benchmarkWithMetadata); + const summaryPath = path.join(runDir, 'summary.json'); + writeJson(summaryPath, summaryWithMetadata); const tags = [...new Set(loadedSources.flatMap((source) => source.tags ?? []))].sort(); return { runDir, runId: toRunId(options.cwd, runDir), manifestPath, - benchmarkPath, - timingPath, + summaryPath, displayName, experiment, combinedFromRunIds: loadedSources.map((source) => source.id), diff --git a/apps/cli/src/commands/results/combine.ts b/apps/cli/src/commands/results/combine.ts index 2f96c4e38..4566df69f 100644 --- a/apps/cli/src/commands/results/combine.ts +++ b/apps/cli/src/commands/results/combine.ts @@ -114,7 +114,7 @@ export const resultsCombineCommand = command({ displayName: option({ type: optional(string), long: 'display-name', - description: 'Display name stored in benchmark.json metadata', + description: 'Display name stored in summary.json metadata', }), duplicatePolicy: option({ type: optional(oneOf(['prompt', 'error', 'latest'])), @@ -175,8 +175,7 @@ export const resultsCombineCommand = command({ }); console.log(`Combined ${result.testCount} result row(s) into ${result.runDir}`); console.log(` Run ID: ${result.runId}`); - console.log(` Benchmark: ${result.benchmarkPath}`); - console.log(` Timing: ${result.timingPath}`); + console.log(` Summary: ${result.summaryPath}`); if (result.duplicateConflicts.length > 0) { console.log(` Duplicates handled: ${result.duplicateConflicts.length}`); } diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index bfd4e51df..83dbec6f0 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -4,16 +4,16 @@ * * Output structure: * / - * benchmark.json — aggregate scores, pass/fail counts, timing + * summary.json — run aggregate scores, metadata, and timing * index.jsonl — per-test manifest with artifact pointers * / - * grading.json — per-test grading artifact (assertions, graders) - * timing.json — per-test timing artifact - * outputs/answer.md — human-readable agent response for this test - * task/PROMPT.md — human-readable input messages for this test + * summary.json — per-case aggregate + * run-1/result.json — per-run result + * run-1/grading.json — per-run grading artifact (assertions, graders) + * run-1/metrics.json — per-run metrics artifact * * This module delegates artifact building to the shared artifact-writer so - * that benchmark/grading/timing schemas stay aligned with `agentv eval`. + * that summary/grading/timing schemas stay aligned with `agentv eval`. * * How to extend: * - To change artifact schemas, update artifact-writer.ts (single source of truth). diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 679cff1ef..0372b79e8 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -29,6 +29,14 @@ export interface ResultManifestRecord { readonly target?: string; readonly score: number; readonly scores?: readonly Record[]; + readonly trials?: readonly { + readonly attempt?: number; + readonly run_path?: string; + readonly score?: number; + readonly verdict?: string; + readonly [key: string]: unknown; + }[]; + readonly aggregation?: Record; readonly execution_status?: string; readonly error?: string; readonly cost_usd?: number; @@ -39,7 +47,6 @@ export interface ResultManifestRecord { readonly reasoning?: number; }; readonly trace?: Record; - readonly benchmark_path?: string; readonly summary_path?: string; readonly grading_path?: string; readonly timing_path?: string; diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts index e6b69c6d5..042259775 100644 --- a/apps/cli/src/commands/results/projection-bundle.ts +++ b/apps/cli/src/commands/results/projection-bundle.ts @@ -87,7 +87,6 @@ export type ProjectionBundleArtifactRefs = Partial< Pick< IndexArtifactEntry, | 'artifact_dir' - | 'benchmark_path' | 'summary_path' | 'grading_path' | 'timing_path' @@ -173,7 +172,6 @@ function artifactRefs( return dropUndefined({ ...metadataRefs, artifact_dir: indexEntry.artifact_dir, - benchmark_path: indexEntry.benchmark_path, summary_path: indexEntry.summary_path, grading_path: indexEntry.grading_path, input_path: indexEntry.input_path, diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts index 109b584d3..113523dfd 100644 --- a/apps/cli/src/commands/results/report.ts +++ b/apps/cli/src/commands/results/report.ts @@ -13,7 +13,7 @@ interface ReportManifestRecord { readonly eval_file?: string; } -interface BenchmarkMetadata { +interface RunSummaryMetadata { readonly metadata?: { readonly eval_file?: string; }; @@ -33,15 +33,15 @@ function normalizeEvalFileLabel(value: string | undefined): string | undefined { .replace(/\.jsonl$/i, ''); } -function readBenchmarkEvalFile(sourceFile: string): string | undefined { - const benchmarkPath = path.join(path.dirname(sourceFile), 'benchmark.json'); - if (!existsSync(benchmarkPath)) { +function readSummaryEvalFile(sourceFile: string): string | undefined { + const summaryPath = path.join(path.dirname(sourceFile), 'summary.json'); + if (!existsSync(summaryPath)) { return undefined; } try { - const benchmark = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as BenchmarkMetadata; - return normalizeEvalFileLabel(benchmark.metadata?.eval_file); + const summary = JSON.parse(readFileSync(summaryPath, 'utf8')) as RunSummaryMetadata; + return normalizeEvalFileLabel(summary.metadata?.eval_file); } catch { return undefined; } @@ -55,11 +55,11 @@ function serializeReportResult( result: EvaluationResult, sourceFile: string, manifestRecord?: ReportManifestRecord, - benchmarkEvalFile?: string, + summaryEvalFile?: string, ): Record { const fallbackEvalFile = normalizeEvalFileLabel(manifestRecord?.eval_file) ?? - benchmarkEvalFile ?? + summaryEvalFile ?? normalizeEvalFileLabel(result.suite) ?? path.basename(path.dirname(sourceFile)); @@ -90,7 +90,7 @@ export async function loadReportSource( sourceFile: string; results: EvaluationResult[]; records: readonly ReportManifestRecord[]; - benchmarkEvalFile?: string; + summaryEvalFile?: string; }> { const { sourceFile } = await resolveSourceFile(source, cwd); const resolvedSourceFile = resolveResultSourcePath(sourceFile, cwd); @@ -106,7 +106,7 @@ export async function loadReportSource( sourceFile: resolvedSourceFile, results, records, - benchmarkEvalFile: readBenchmarkEvalFile(resolvedSourceFile), + summaryEvalFile: readSummaryEvalFile(resolvedSourceFile), }; } @@ -114,14 +114,14 @@ export function renderResultsReport( results: readonly EvaluationResult[], sourceFile: string, records: readonly ReportManifestRecord[], - benchmarkEvalFile?: string, + summaryEvalFile?: string, ): string { if (!RESULTS_REPORT_TEMPLATE.includes('__DATA_PLACEHOLDER__')) { throw new Error('Report template is missing __DATA_PLACEHOLDER__'); } const rows = results.map((result, index) => - serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile), + serializeReportResult(result, sourceFile, records[index], summaryEvalFile), ); const dataJson = JSON.stringify(rows).replace(/<\//g, '<\\/'); return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', () => dataJson); @@ -132,13 +132,13 @@ export async function writeResultsReport( outputPath: string | undefined, cwd: string, ): Promise<{ sourceFile: string; outputPath: string; html: string }> { - const { sourceFile, results, records, benchmarkEvalFile } = await loadReportSource(source, cwd); + const { sourceFile, results, records, summaryEvalFile } = await loadReportSource(source, cwd); const resolvedOutputPath = outputPath ? path.isAbsolute(outputPath) ? outputPath : path.resolve(cwd, outputPath) : deriveReportPath(sourceFile); - const html = renderResultsReport(results, sourceFile, records, benchmarkEvalFile); + const html = renderResultsReport(results, sourceFile, records, summaryEvalFile); mkdirSync(path.dirname(resolvedOutputPath), { recursive: true }); writeFileSync(resolvedOutputPath, html, 'utf8'); diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 7685f7660..d6f649fce 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -640,6 +640,36 @@ function normalizeArtifactRelativePath(relativePath: string): string | undefined return segments.join('/'); } +function requestedArtifactDir(c: C): { value?: string; error?: string } { + const raw = c.req.query('artifact_dir')?.trim(); + if (!raw) { + return {}; + } + const normalized = normalizeArtifactRelativePath(raw); + if (!normalized) { + return { error: 'Invalid artifact_dir' }; + } + return { value: normalized }; +} + +function manifestRecordSelection( + records: readonly ResultManifestRecord[], + evalId: string, + artifactDir?: string, +): { record: ResultManifestRecord; index: number } | undefined { + return records + .map((record, index) => ({ record, index })) + .find(({ record }) => { + if (record.test_id !== evalId) { + return false; + } + if (!artifactDir) { + return true; + } + return normalizeArtifactRelativePath(record.artifact_dir ?? '') === artifactDir; + }); +} + function relativeRunPathFromNormalizedManifestPath(manifestPath: string): string | undefined { const parts = manifestPath.split('/').filter(Boolean); const runsIndex = parts.lastIndexOf('runs'); @@ -765,6 +795,46 @@ function displayPathFromArtifactKey(key: string | undefined, runPath: string | u return normalizeArtifactRelativePath(normalizedKey.slice(runPrefix.length)) ?? normalizedKey; } +function addTrialRunCatalogEntries( + entries: ArtifactCatalogEntry[], + seen: Set, + record: ResultManifestRecord, +): void { + const artifactDir = record.artifact_dir + ? normalizeArtifactRelativePath(record.artifact_dir) + : undefined; + if (!artifactDir) return; + for (const trial of record.trials ?? []) { + const runPath = trial.run_path ? normalizeArtifactRelativePath(trial.run_path) : undefined; + if (!runPath) continue; + const runDir = path.posix.join(artifactDir, runPath); + addDirectArtifactCatalogEntry( + entries, + seen, + path.posix.join(runDir, 'result.json'), + 'artifact', + ); + addDirectArtifactCatalogEntry( + entries, + seen, + path.posix.join(runDir, 'grading.json'), + 'artifact', + ); + addDirectArtifactCatalogEntry( + entries, + seen, + path.posix.join(runDir, 'metrics.json'), + 'artifact', + ); + addDirectArtifactCatalogEntry( + entries, + seen, + path.posix.join(runDir, 'timing.json'), + 'artifact', + ); + } +} + function buildResultArtifactCatalog( record: ResultManifestRecord, options?: { readonly runPath?: string }, @@ -780,7 +850,6 @@ function buildResultArtifactCatalog( addPointerArtifactCatalogEntry(entries, seen, trace, 'trace', options?.runPath); addPointerArtifactCatalogEntry(entries, seen, answer, 'answer', options?.runPath); - addDirectArtifactCatalogEntry(entries, seen, record.benchmark_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.summary_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.grading_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.timing_path, 'artifact'); @@ -792,6 +861,7 @@ function buildResultArtifactCatalog( addDirectArtifactCatalogEntry(entries, seen, recordWithTrace.trace_path, 'trace'); addDirectArtifactCatalogEntry(entries, seen, record.eval_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.targets_path, 'artifact'); + addTrialRunCatalogEntries(entries, seen, record); return entries; } @@ -1028,6 +1098,118 @@ function stripHeavyFields(results: readonly EvaluationResult[]) { }); } +function readArtifactJsonObject( + baseDir: string, + relativePath: string | undefined, +): Record | undefined { + if (!relativePath) return undefined; + const resolved = resolveReadableRunArtifactFile(baseDir, relativePath); + if (!resolved.absolutePath) return undefined; + try { + const parsed = JSON.parse(readFileSync(resolved.absolutePath, 'utf8')) as unknown; + return parsed && typeof parsed === 'object' && !Array.isArray(parsed) + ? (parsed as Record) + : undefined; + } catch { + return undefined; + } +} + +function numberField(record: Record | undefined, key: string): number | undefined { + const value = record?.[key]; + return typeof value === 'number' && Number.isFinite(value) ? value : undefined; +} + +function objectField( + record: Record | undefined, + key: string, +): Record | undefined { + const value = record?.[key]; + return value && typeof value === 'object' && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function caseTrialArtifactPath( + artifactDir: string | undefined, + runPath: string | undefined, + filePath: string, +): string | undefined { + if (!artifactDir || !runPath) return undefined; + return path.posix.join(artifactDir, runPath, filePath); +} + +function buildRepeatTrialReadModels( + baseDir: string, + record: ResultManifestRecord, +): Array> | undefined { + if (!record.trials || record.trials.length === 0) return undefined; + const artifactDir = record.artifact_dir + ? normalizeArtifactRelativePath(record.artifact_dir) + : undefined; + + return record.trials.map((trial) => { + const runPath = trial.run_path ? normalizeArtifactRelativePath(trial.run_path) : undefined; + const metricsPath = caseTrialArtifactPath(artifactDir, runPath, 'metrics.json'); + const timingPath = caseTrialArtifactPath(artifactDir, runPath, 'timing.json'); + const gradingPath = caseTrialArtifactPath(artifactDir, runPath, 'grading.json'); + const transcriptPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript-raw.jsonl'); + const answerPath = caseTrialArtifactPath(artifactDir, runPath, 'outputs/answer.md'); + const metrics = readArtifactJsonObject(baseDir, metricsPath); + const timing = readArtifactJsonObject(baseDir, timingPath); + const toolCalls = objectField(metrics, 'tool_calls'); + const tokenUsage = objectField(timing, 'token_usage'); + + return { + ...trial, + ...(numberField(timing, 'duration_ms') !== undefined && { + duration_ms: numberField(timing, 'duration_ms'), + }), + ...(numberField(timing, 'total_tokens') !== undefined && { + total_tokens: numberField(timing, 'total_tokens'), + }), + ...(numberField(timing, 'cost_usd') !== undefined && { + cost_usd: numberField(timing, 'cost_usd'), + }), + ...(tokenUsage && { token_usage: tokenUsage }), + ...(numberField(metrics, 'total_tool_calls') !== undefined && { + total_tool_calls: numberField(metrics, 'total_tool_calls'), + }), + ...(toolCalls && { tool_calls: toolCalls }), + ...(metricsPath && { metrics_path: metricsPath }), + ...(timingPath && { timing_path: timingPath }), + ...(gradingPath && { grading_path: gradingPath }), + ...(transcriptPath && { transcript_path: transcriptPath }), + ...(answerPath && { answer_path: answerPath }), + }; + }); +} + +function attachRunDetailReadModelFields>( + results: readonly T[], + records: readonly ResultManifestRecord[], + baseDir: string, +): T[] { + return results.map((result, index) => { + const record = records[index]; + if (!record) return result; + const trials = buildRepeatTrialReadModels(baseDir, record); + return { + ...result, + ...(record.aggregation && { aggregation: record.aggregation }), + ...(record.artifact_dir && { artifact_dir: record.artifact_dir }), + ...(record.summary_path && { summary_path: record.summary_path }), + ...(record.grading_path && { grading_path: record.grading_path }), + ...(record.timing_path && { timing_path: record.timing_path }), + ...(record.metrics_path && { metrics_path: record.metrics_path }), + ...(record.transcript_path && { transcript_path: record.transcript_path }), + ...(record.output_path && { output_path: record.output_path }), + ...(record.answer_path && { answer_path: record.answer_path }), + ...(trials && { trials }), + }; + }); +} + // ── Shared data-route handlers ─────────────────────────────────────────── // // Each handler takes a Hono Context and a DataContext (resolved directories). @@ -1460,8 +1642,12 @@ async function handleRunDetail(c: C, { searchDir, projectId }: DataContext) { const resumeMeta = meta.source === 'local' ? deriveResumeMeta(searchDir, meta.path) : {}; const liveStatus = meta.source === 'local' ? getActiveRunStatus(meta.path) : undefined; const tagFields = await readRunTagFields(searchDir, meta, projectId); + const baseDir = path.dirname(meta.path); return c.json({ - results: attachExternalTraceFields(stripHeavyFields(loaded), records), + results: attachExternalTraceFields( + attachRunDetailReadModelFields(stripHeavyFields(loaded), records, baseDir), + records, + ), source: meta.source, source_label: meta.displayName, ...tagFields, @@ -1509,7 +1695,7 @@ function attachExternalTraceFields>( /** * Compute `run_dir` (relative to cwd, snake_case) and `suite_filter` (the - * eval file path stored in benchmark.json metadata) for a local run manifest. + * eval file path stored in summary.json metadata) for a local run manifest. * Returns whatever fields could be resolved — both are best-effort and only * needed by the Dashboard "Resume run" / "Rerun failed" actions. */ @@ -1525,9 +1711,9 @@ function deriveResumeMeta( // dir as cwd) is unusual but valid — fall through to absolute in that case. out.run_dir = relative !== '' && !relative.startsWith('..') ? relative : runDir; try { - const benchmarkPath = path.join(runDir, 'benchmark.json'); - if (existsSync(benchmarkPath)) { - const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as { + const summaryPath = path.join(runDir, 'summary.json'); + if (existsSync(summaryPath)) { + const parsed = JSON.parse(readFileSync(summaryPath, 'utf8')) as { metadata?: { eval_file?: string; planned_test_count?: number }; }; const evalFile = parsed.metadata?.eval_file; @@ -1540,7 +1726,7 @@ function deriveResumeMeta( } } } catch { - // benchmark.json missing / unreadable / malformed — leave fields unset. + // summary.json missing / unreadable / malformed — leave fields unset. } return out; } @@ -1647,14 +1833,24 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) { const filename = c.req.param('filename') ?? ''; - const evalId = c.req.param('evalId'); + const evalId = c.req.param('evalId') ?? ''; + if (!evalId) return c.json({ error: 'Eval id is required' }, 400); + const artifactDir = requestedArtifactDir(c); + if (artifactDir.error) return c.json({ error: artifactDir.error }, 400); const meta = await findRunById(searchDir, filename, projectId); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId); - const result = loaded.find((r) => r.testId === evalId); - if (!result) return c.json({ error: 'Eval not found' }, 404); - const [stripped] = stripHeavyFields([result]); + const records = await parseManifestForMeta(searchDir, meta, projectId); + const selection = manifestRecordSelection(records, evalId, artifactDir.value); + const result = selection ? loaded[selection.index] : undefined; + if (!selection || !result) return c.json({ error: 'Eval not found' }, 404); + const baseDir = path.dirname(meta.path); + const [stripped] = attachRunDetailReadModelFields( + stripHeavyFields([result]), + [selection.record], + baseDir, + ); return c.json({ eval: stripped }); } catch { return c.json({ error: 'Failed to load eval' }, 500); @@ -1663,13 +1859,17 @@ async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) { async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) { const filename = c.req.param('filename') ?? ''; - const evalId = c.req.param('evalId'); + const evalId = c.req.param('evalId') ?? ''; + if (!evalId) return c.json({ error: 'Eval id is required' }, 400); + const artifactDir = requestedArtifactDir(c); + if (artifactDir.error) return c.json({ error: artifactDir.error }, 400); const meta = await findRunById(searchDir, filename, projectId); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const records = await parseManifestForMeta(searchDir, meta, projectId); - const record = records.find((r) => r.test_id === evalId); - if (!record) return c.json({ error: 'Eval not found' }, 404); + const selection = manifestRecordSelection(records, evalId, artifactDir.value); + if (!selection) return c.json({ error: 'Eval not found' }, 404); + const { record } = selection; const baseDir = path.dirname(meta.path); const catalog = buildResultArtifactCatalog(record, { @@ -1686,7 +1886,10 @@ async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) { async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext) { const filename = c.req.param('filename') ?? ''; - const evalId = c.req.param('evalId'); + const evalId = c.req.param('evalId') ?? ''; + if (!evalId) return c.json({ error: 'Eval id is required' }, 400); + const artifactDir = requestedArtifactDir(c); + if (artifactDir.error) return c.json({ error: artifactDir.error }, 400); const meta = await findRunById(searchDir, filename, projectId); if (!meta) return c.json({ error: 'Run not found' }, 404); @@ -1705,8 +1908,9 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext await ensureRunReadable(searchDir, meta, projectId); const records = parseResultManifest(readFileSync(meta.path, 'utf8')); - const record = records.find((r) => r.test_id === evalId); - if (!record) return c.json({ error: 'Eval not found' }, 404); + const selection = manifestRecordSelection(records, evalId, artifactDir.value); + if (!selection) return c.json({ error: 'Eval not found' }, 404); + const { record } = selection; const catalog = buildResultArtifactCatalog(record, { runPath: relativeRunPathFromManifestPath(meta.path), }); @@ -1729,14 +1933,18 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext async function handleEvalTraceSession(c: C, { searchDir, projectId }: DataContext) { const filename = c.req.param('filename') ?? ''; - const evalId = c.req.param('evalId'); + const evalId = c.req.param('evalId') ?? ''; + if (!evalId) return c.json({ error: 'Eval id is required' }, 400); + const artifactDir = requestedArtifactDir(c); + if (artifactDir.error) return c.json({ error: artifactDir.error }, 400); const meta = await findRunById(searchDir, filename, projectId); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const records = await parseManifestForMeta(searchDir, meta, projectId); - const record = records.find((r) => r.test_id === evalId); - if (!record) return c.json({ error: 'Eval not found' }, 404); + const selection = manifestRecordSelection(records, evalId, artifactDir.value); + if (!selection) return c.json({ error: 'Eval not found' }, 404); + const { record } = selection; const trace = resolveRecordArtifactPointer(record, 'trace'); const runPath = relativeRunPathFromManifestPath(meta.path); @@ -1857,14 +2065,18 @@ async function handleEvalTraceSession(c: C, { searchDir, projectId }: DataContex async function handleEvalTranscript(c: C, { searchDir, projectId }: DataContext) { const filename = c.req.param('filename') ?? ''; - const evalId = c.req.param('evalId'); + const evalId = c.req.param('evalId') ?? ''; + if (!evalId) return c.json({ error: 'Eval id is required' }, 400); + const artifactDir = requestedArtifactDir(c); + if (artifactDir.error) return c.json({ error: artifactDir.error }, 400); const meta = await findRunById(searchDir, filename, projectId); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const records = await parseManifestForMeta(searchDir, meta, projectId); - const record = records.find((r) => r.test_id === evalId); - if (!record) return c.json({ error: 'Eval not found' }, 404); + const selection = manifestRecordSelection(records, evalId, artifactDir.value); + if (!selection) return c.json({ error: 'Eval not found' }, 404); + const { record } = selection; const transcript = resolveRecordArtifactPointer(record, 'transcript'); const answer = resolveRecordArtifactPointer(record, 'answer'); diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts index 680f5b0b4..cbd2f8679 100644 --- a/apps/cli/src/commands/results/validate.ts +++ b/apps/cli/src/commands/results/validate.ts @@ -5,9 +5,9 @@ * Checks: * 1. Directory follows the `.agentv/results//` naming convention * 2. index.jsonl exists and each line has required fields - * 3. Per-test grading.json exists for every entry in the index - * 4. Per-test timing.json exists for direct case rows (warning if missing) - * 5. benchmark.json exists (warning if missing) + * 3. Per-case summary.json exists for every entry in the index + * 4. Per-run result.json and grading.json exist for every materialized trial + * 5. summary.json exists * 6. Scores are within [0, 1] * 7. index.jsonl entries have `scores[]` array (warning if missing — dashboard needs it) * @@ -34,10 +34,11 @@ interface IndexEntry { readonly target?: string; readonly scores?: unknown[]; readonly execution_status?: string; - readonly benchmark_path?: string; readonly summary_path?: string; readonly grading_path?: string; readonly timing_path?: string; + readonly artifact_dir?: string; + readonly trials?: readonly { readonly run_path?: string }[]; readonly [key: string]: unknown; } @@ -141,10 +142,10 @@ function checkIndexJsonl(runDir: string): { diagnostics: Diagnostic[]; entries: }); } - if (!entry.grading_path && !entry.benchmark_path) { + if (!entry.summary_path) { diagnostics.push({ - severity: 'warning', - message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'grading_path' or 'benchmark_path'`, + severity: 'error', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'summary_path'`, }); } @@ -215,12 +216,23 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[] } } - if (entry.benchmark_path) { - const benchmarkPath = path.join(runDir, entry.benchmark_path); - if (!existsSync(benchmarkPath)) { + for (const trial of entry.trials ?? []) { + if (!entry.artifact_dir || !trial.run_path) { + continue; + } + const runDirPath = path.join(runDir, entry.artifact_dir, trial.run_path); + const resultPath = path.join(runDirPath, 'result.json'); + const gradingPath = path.join(runDirPath, 'grading.json'); + if (!existsSync(resultPath)) { + diagnostics.push({ + severity: 'error', + message: `${testId}: result.json not found at '${path.posix.join(entry.artifact_dir, trial.run_path, 'result.json')}'`, + }); + } + if (!existsSync(gradingPath)) { diagnostics.push({ severity: 'error', - message: `${testId}: benchmark.json not found at '${entry.benchmark_path}'`, + message: `${testId}: grading.json not found at '${path.posix.join(entry.artifact_dir, trial.run_path, 'grading.json')}'`, }); } } @@ -269,10 +281,10 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[] } } - // Check benchmark.json - const benchmarkPath = path.join(runDir, 'benchmark.json'); - if (!existsSync(benchmarkPath)) { - diagnostics.push({ severity: 'warning', message: 'benchmark.json is missing' }); + // Check run summary.json + const summaryPath = path.join(runDir, 'summary.json'); + if (!existsSync(summaryPath)) { + diagnostics.push({ severity: 'error', message: 'summary.json is missing' }); } return diagnostics; diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts index 40b00d9a1..9ef6034ca 100644 --- a/apps/cli/test/commands/eval/aggregate.test.ts +++ b/apps/cli/test/commands/eval/aggregate.test.ts @@ -115,7 +115,7 @@ describe('aggregateRunDir', () => { rmSync(tmpDir, { recursive: true, force: true }); }); - it('reads index.jsonl, deduplicates, writes benchmark.json and timing.json', async () => { + it('reads index.jsonl, deduplicates, and writes summary.json with timing rollups', async () => { writeJsonlIndex(tmpDir, [ { testId: 'a', target: 'x', score: 0.1, executionStatus: 'execution_error' }, { testId: 'a', target: 'x', score: 0.9, executionStatus: 'ok' }, @@ -126,13 +126,11 @@ describe('aggregateRunDir', () => { expect(result.testCount).toBe(2); expect(result.targetCount).toBe(1); - const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8')); - expect(benchmark.metadata.tests_run).toContain('a'); - expect(benchmark.metadata.tests_run).toContain('b'); - expect(benchmark.run_summary.x).toBeDefined(); - - const timing = JSON.parse(readFileSync(result.timingPath, 'utf8')); - expect(timing.total_tokens).toBeGreaterThanOrEqual(0); + const summary = JSON.parse(readFileSync(result.summaryPath, 'utf8')); + expect(summary.metadata.tests_run).toContain('a'); + expect(summary.metadata.tests_run).toContain('b'); + expect(summary.run_summary.x).toBeDefined(); + expect(summary.timing.total_tokens).toBeGreaterThanOrEqual(0); }); it('uses last entry for duplicates in benchmark stats', async () => { @@ -144,7 +142,7 @@ describe('aggregateRunDir', () => { const result = await aggregateRunDir(tmpDir); expect(result.testCount).toBe(1); - const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8')); + const benchmark = JSON.parse(readFileSync(result.summaryPath, 'utf8')); // Should have 100% pass rate since the last entry is ok with score 1.0 expect(benchmark.run_summary.x.pass_rate.mean).toBe(1); }); @@ -181,13 +179,19 @@ describe('writePerTestArtifacts', () => { await writePerTestArtifacts(results, tmpDir); - const grading1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'grading.json'), 'utf8')); + const grading1 = JSON.parse( + readFileSync(path.join(tmpDir, 'test-1', 'run-1', 'grading.json'), 'utf8'), + ); expect(grading1.assertions).toHaveLength(1); - const timing1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'timing.json'), 'utf8')); + const timing1 = JSON.parse( + readFileSync(path.join(tmpDir, 'test-1', 'run-1', 'timing.json'), 'utf8'), + ); expect(timing1.total_tokens).toBeGreaterThanOrEqual(0); - const grading2 = JSON.parse(readFileSync(path.join(tmpDir, 'test-2', 'grading.json'), 'utf8')); + const grading2 = JSON.parse( + readFileSync(path.join(tmpDir, 'test-2', 'run-1', 'grading.json'), 'utf8'), + ); expect(grading2.assertions).toHaveLength(1); }); @@ -196,7 +200,10 @@ describe('writePerTestArtifacts', () => { await writePerTestArtifacts(results, tmpDir); - const answer = readFileSync(path.join(tmpDir, 'test-1', 'outputs', 'answer.md'), 'utf8'); + const answer = readFileSync( + path.join(tmpDir, 'test-1', 'run-1', 'outputs', 'answer.md'), + 'utf8', + ); expect(answer).toContain('hello'); }); }); diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 3e2feac68..bf30802d4 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -26,14 +26,14 @@ import { import { type AggregateGradingArtifact, - type BenchmarkArtifact, type GradingArtifact, type IndexArtifactEntry, + type RunSummaryArtifact, type TimingArtifact, buildAggregateGradingArtifact, - buildBenchmarkArtifact, buildGradingArtifact, buildIndexArtifactEntry, + buildRunSummaryArtifact, buildTimingArtifact, parseJsonlResults, writeArtifacts, @@ -336,7 +336,7 @@ describe('buildTimingArtifact', () => { // Benchmark artifact // --------------------------------------------------------------------------- -describe('buildBenchmarkArtifact', () => { +describe('buildRunSummaryArtifact', () => { it('computes per-target statistics', () => { const results = [ makeResult({ target: 'gpt-4', score: 0.9, durationMs: 30000 }), @@ -344,7 +344,7 @@ describe('buildBenchmarkArtifact', () => { makeResult({ target: 'claude', score: 0.5, durationMs: 45000 }), ]; - const benchmark = buildBenchmarkArtifact(results, 'test.eval.yaml'); + const benchmark = buildRunSummaryArtifact(results, 'test.eval.yaml'); expect(benchmark.metadata.eval_file).toBe('test.eval.yaml'); expect(benchmark.metadata.targets).toEqual(['claude', 'gpt-4']); @@ -371,7 +371,7 @@ describe('buildBenchmarkArtifact', () => { }), ]; - const benchmark = buildBenchmarkArtifact(results); + const benchmark = buildRunSummaryArtifact(results); expect(benchmark.per_grader_summary).toBeDefined(); expect(benchmark.per_grader_summary?.['quality:llm-grader'].mean).toBe(0.8); @@ -380,7 +380,7 @@ describe('buildBenchmarkArtifact', () => { it('adds note when execution errors present', () => { const results = [makeResult({ executionStatus: 'execution_error', score: 0 })]; - const benchmark = buildBenchmarkArtifact(results); + const benchmark = buildRunSummaryArtifact(results); expect(benchmark.notes).toContain( '1 test(s) had execution errors and are excluded from quality pass_rate', ); @@ -401,14 +401,14 @@ describe('buildBenchmarkArtifact', () => { }), ]; - const benchmark = buildBenchmarkArtifact(results); + const benchmark = buildRunSummaryArtifact(results); expect(benchmark.run_summary['test-target'].pass_rate.mean).toBe(1); expect(benchmark.per_grader_summary?.['quality:llm-grader'].mean).toBe(1); }); it('handles empty results', () => { - const benchmark = buildBenchmarkArtifact([]); + const benchmark = buildRunSummaryArtifact([]); expect(benchmark.metadata.targets).toEqual([]); expect(benchmark.metadata.tests_run).toEqual([]); @@ -418,7 +418,7 @@ describe('buildBenchmarkArtifact', () => { it('includes cost_usd when available', () => { const results = [makeResult({ costUsd: 0.05 }), makeResult({ testId: 'test-2', costUsd: 0.1 })]; - const benchmark = buildBenchmarkArtifact(results); + const benchmark = buildRunSummaryArtifact(results); const summary = benchmark.run_summary['test-target']; expect(summary.cost_usd).toBeDefined(); expect(summary.cost_usd?.mean).toBe(0.075); @@ -576,11 +576,10 @@ describe('buildIndexArtifactEntry', () => { }), { outputDir: '/tmp/artifacts', - gradingPath: '/tmp/artifacts/alpha/grading.json', - timingPath: '/tmp/artifacts/alpha/timing.json', + gradingPath: '/tmp/artifacts/alpha/run-1/grading.json', + timingPath: '/tmp/artifacts/alpha/run-1/timing.json', outputPath: '/tmp/artifacts/alpha/outputs/answer.md', answerPath: '/tmp/artifacts/alpha/outputs/answer.md', - inputPath: '/tmp/artifacts/alpha/task/PROMPT.md', }, ); @@ -608,11 +607,32 @@ describe('buildIndexArtifactEntry', () => { ], execution_status: 'quality_failure', error: 'model drift', - grading_path: 'alpha/grading.json', - timing_path: 'alpha/timing.json', + grading_path: 'alpha/run-1/grading.json', + timing_path: 'alpha/run-1/timing.json', output_path: 'alpha/outputs/answer.md', answer_path: 'alpha/outputs/answer.md', - input_path: 'alpha/task/PROMPT.md', + trials: [ + { + attempt: 0, + run_path: 'run-1', + score: 0.9, + verdict: 'pass', + scores: [ + { + name: 'quality', + type: 'llm-grader', + score: 0.7, + assertions: [ + { text: 'criterion-a', passed: true }, + { text: 'criterion-b', passed: false }, + ], + }, + ], + error: 'model drift', + cost_usd: 0.25, + execution_status: 'quality_failure', + }, + ], }); }); @@ -633,8 +653,8 @@ describe('buildIndexArtifactEntry', () => { }), { outputDir: '/tmp/artifacts', - gradingPath: '/tmp/artifacts/alpha/grading.json', - timingPath: '/tmp/artifacts/alpha/timing.json', + gradingPath: '/tmp/artifacts/alpha/run-1/grading.json', + timingPath: '/tmp/artifacts/alpha/run-1/timing.json', }, ); @@ -698,9 +718,9 @@ describe('parseJsonlResults', () => { artifactPointers: { transcript: { ref: 'agentv/artifacts/v1', - key: 'transcripts/pointer-row/transcript.jsonl', + key: 'transcripts/pointer-row/run-1/transcript-raw.jsonl', object_version: 'sha256:test', - path: 'pointer-row/transcript.jsonl', + path: 'pointer-row/run-1/transcript-raw.jsonl', sha256: 'test', size: 1, schema_version: 'agentv.transcript.v1', @@ -719,7 +739,7 @@ describe('parseJsonlResults', () => { target: 'codex', score: 1, output: 'done', - raw_provider_log_path: 'raw-log-case/provider.log', + raw_provider_log_path: 'raw-log-case/run-1/provider.log', })}\n`; const results = parseJsonlResults(content); @@ -815,7 +835,7 @@ describe('schema compatibility', () => { }); it('benchmark run_summary has pass_rate/time_seconds/tokens with mean/stddev', () => { - const benchmark = buildBenchmarkArtifact([makeResult({})]); + const benchmark = buildRunSummaryArtifact([makeResult({})]); const summary = benchmark.run_summary['test-target']; expect(summary).toBeDefined(); @@ -843,7 +863,7 @@ describe('writeArtifactsFromResults', () => { await rm(testDir, { recursive: true, force: true }).catch(() => undefined); }); - it('writes grading, timing, and benchmark files', async () => { + it('writes summary, index, and per-run artifact files', async () => { const results = [ makeResult({ testId: 'alpha', score: 0.9, durationMs: 5000 }), makeResult({ testId: 'beta', score: 0.6, durationMs: 8000 }), @@ -855,54 +875,47 @@ describe('writeArtifactsFromResults', () => { // Check per-test artifact directories const artifactEntries = await readdir(paths.testArtifactDir); - expect(artifactEntries.sort()).toEqual([ - 'alpha', - 'benchmark.json', - 'beta', - 'index.jsonl', - 'timing.json', - ]); + expect(artifactEntries.sort()).toEqual(['alpha', 'beta', 'index.jsonl', 'summary.json']); const alphaEntries = await readdir(path.join(paths.testArtifactDir, 'alpha')); - expect(alphaEntries.sort()).toEqual([ + expect(alphaEntries.sort()).toEqual(['run-1', 'summary.json']); + + const alphaRunEntries = await readdir(path.join(paths.testArtifactDir, 'alpha', 'run-1')); + expect(alphaRunEntries.sort()).toEqual([ 'grading.json', 'metrics.json', 'outputs', + 'result.json', 'timing.json', - 'trace.json', - 'transcript.jsonl', + 'transcript-raw.jsonl', + 'transcript.json', ]); const alphaGrading: GradingArtifact = JSON.parse( - await readFile(path.join(paths.testArtifactDir, 'alpha', 'grading.json'), 'utf8'), + await readFile(path.join(paths.testArtifactDir, 'alpha', 'run-1', 'grading.json'), 'utf8'), ); expect(alphaGrading.summary).toBeDefined(); expect(alphaGrading).not.toHaveProperty('execution_metrics'); const alphaTiming: TimingArtifact = JSON.parse( - await readFile(path.join(paths.testArtifactDir, 'alpha', 'timing.json'), 'utf8'), + await readFile(path.join(paths.testArtifactDir, 'alpha', 'run-1', 'timing.json'), 'utf8'), ); expect(alphaTiming.duration_ms).toBe(5000); - // Check timing - const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8')); - expect(timing.duration_ms).toBe(13000); - - // Check benchmark - const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8')); - expect(benchmark.metadata.eval_file).toBe('my-eval.yaml'); - expect(benchmark.metadata.tests_run.sort()).toEqual(['alpha', 'beta']); + const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); + expect(summary.metadata.eval_file).toBe('my-eval.yaml'); + expect(summary.metadata.tests_run.sort()).toEqual(['alpha', 'beta']); + expect(summary.timing.duration_ms).toBe(13000); const indexLines = (await readFile(paths.indexPath, 'utf8')) .trim() .split('\n') .map((line) => JSON.parse(line) as IndexArtifactEntry); expect(indexLines).toHaveLength(2); - expect(indexLines[0]?.grading_path).toBe('alpha/grading.json'); - expect(indexLines[0]?.timing_path).toBe('alpha/timing.json'); - expect(indexLines[0]?.trace_path).toBe('alpha/trace.json'); - expect(indexLines[0]?.transcript_path).toBe('alpha/transcript.jsonl'); - expect(indexLines[0]?.metrics_path).toBe('alpha/metrics.json'); + expect(indexLines[0]?.summary_path).toBe('alpha/summary.json'); + expect(indexLines[0]?.grading_path).toBe('alpha/run-1/grading.json'); + expect(indexLines[0]?.timing_path).toBe('alpha/run-1/timing.json'); + expect(indexLines[0]?.metrics_path).toBe('alpha/run-1/metrics.json'); }); it('writes repeat runs in Vercel-compatible case and run folders', async () => { @@ -977,27 +990,14 @@ describe('writeArtifactsFromResults', () => { }); expect(indexEntry?.artifact_dir).toBe('repeat-case'); expect(indexEntry?.summary_path).toBe('repeat-case/summary.json'); - expect(indexEntry?.task_dir).toBe('repeat-case/task'); - expect(indexEntry?.input_path).toBe('repeat-case/task/PROMPT.md'); - expect(indexEntry?.benchmark_path).toBeUndefined(); - expect(indexEntry?.grading_path).toBe('repeat-case/grading.json'); + expect(indexEntry?.task_dir).toBeUndefined(); + expect(indexEntry?.input_path).toBeUndefined(); + expect(indexEntry?.grading_path).toBeUndefined(); expect(indexEntry?.timing_path).toBeUndefined(); expect(indexEntry?.metrics_path).toBeUndefined(); const repeatEntries = await readdir(path.join(paths.testArtifactDir, 'repeat-case')); - expect(repeatEntries.sort()).toEqual([ - 'grading.json', - 'run-1', - 'run-2', - 'summary.json', - 'task', - ]); - - const prompt = await readFile( - path.join(paths.testArtifactDir, 'repeat-case', 'task', 'PROMPT.md'), - 'utf8', - ); - expect(prompt).toBe('@[user]:\nRepeat this task prompt.'); + expect(repeatEntries.sort()).toEqual(['run-1', 'run-2', 'summary.json']); const caseSummary = JSON.parse( await readFile(path.join(paths.testArtifactDir, 'repeat-case', 'summary.json'), 'utf8'), @@ -1031,18 +1031,18 @@ describe('writeArtifactsFromResults', () => { }); expect(typeof caseSummary.fingerprint).toBe('string'); - const aggregateGrading: GradingArtifact = JSON.parse( - await readFile(path.join(paths.testArtifactDir, 'repeat-case', 'grading.json'), 'utf8'), - ); - expect(aggregateGrading.trials).toEqual(indexEntry?.trials); - expect(aggregateGrading.aggregation).toEqual(indexEntry?.aggregation); + await expect( + readFile(path.join(paths.testArtifactDir, 'repeat-case', 'grading.json'), 'utf8'), + ).rejects.toThrow(); for (const runDir of ['run-1', 'run-2']) { const runEntries = await readdir(path.join(paths.testArtifactDir, 'repeat-case', runDir)); expect(runEntries.sort()).toEqual([ 'grading.json', + 'metrics.json', 'outputs', 'result.json', + 'timing.json', 'transcript-raw.jsonl', 'transcript.json', ]); @@ -1094,13 +1094,11 @@ describe('writeArtifactsFromResults', () => { const paths = await writeArtifactsFromResults([], testDir); const artifactEntries = await readdir(paths.testArtifactDir); - expect(artifactEntries.sort()).toEqual(['benchmark.json', 'index.jsonl', 'timing.json']); + expect(artifactEntries.sort()).toEqual(['index.jsonl', 'summary.json']); - const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8')); - expect(timing.total_tokens).toBe(0); - - const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8')); - expect(benchmark.notes).toContain('No results to summarize'); + const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); + expect(summary.notes).toContain('No results to summarize'); + expect(summary.timing.total_tokens).toBe(0); expect(await readFile(paths.indexPath, 'utf8')).toBe(''); }); @@ -1122,13 +1120,13 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); const gradingOne: GradingArtifact = JSON.parse( - await readFile(path.join(testDir, 'test-1', 'grading.json'), 'utf8'), + await readFile(path.join(testDir, 'test-1', 'run-1', 'grading.json'), 'utf8'), ); const gradingTwo: GradingArtifact = JSON.parse( - await readFile(path.join(testDir, 'test-2', 'grading.json'), 'utf8'), + await readFile(path.join(testDir, 'test-2', 'run-1', 'grading.json'), 'utf8'), ); const timingOne: TimingArtifact = JSON.parse( - await readFile(path.join(testDir, 'test-1', 'timing.json'), 'utf8'), + await readFile(path.join(testDir, 'test-1', 'run-1', 'timing.json'), 'utf8'), ); expect(gradingOne.summary.total).toBe(1); @@ -1179,22 +1177,18 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); - const transcriptPath = path.join(testDir, 'transcript-case', 'transcript.jsonl'); + const transcriptPath = path.join(testDir, 'transcript-case', 'run-1', 'transcript-raw.jsonl'); const transcriptLines = (await readFile(transcriptPath, 'utf8')) .trim() .split('\n') .map((line) => JSON.parse(line)); - const envelope = TraceEnvelopeWireSchema.parse( - JSON.parse(await readFile(path.join(testDir, 'transcript-case', 'trace.json'), 'utf8')), + const transcriptMessages = JSON.parse( + await readFile(path.join(testDir, 'transcript-case', 'run-1', 'transcript.json'), 'utf8'), ); - const projectedEnvelope = fromTraceEnvelopeWire(envelope); - const projectedTranscript = traceEnvelopeToTranscriptJsonLines(projectedEnvelope, { - testId: 'transcript-case', - target: 'codex', - }); - expect(transcriptLines).toEqual(JSON.parse(JSON.stringify(projectedTranscript))); + expect(Array.isArray(transcriptMessages)).toBe(true); + expect(transcriptMessages).toHaveLength(2); expect(transcriptLines).toHaveLength(2); expect(transcriptLines[0]).toMatchObject({ schema_version: 'agentv.transcript.v1', @@ -1209,9 +1203,9 @@ describe('writeArtifactsFromResults', () => { capture: { content: 'full', redaction_level: 'none', redacted_fields: [] }, trace: { schema_version: 'agentv.trace.v1', - artifact_id: envelope.artifact_id, - trace_id: envelope.trace.trace_id, - span_id: envelope.trace.root_span_id, + artifact_id: expect.any(String), + trace_id: expect.any(String), + span_id: expect.any(String), }, source: { kind: 'agentv_run', @@ -1242,8 +1236,8 @@ describe('writeArtifactsFromResults', () => { status: 'ok', trace: { schema_version: 'agentv.trace.v1', - artifact_id: envelope.artifact_id, - trace_id: envelope.trace.trace_id, + artifact_id: expect.any(String), + trace_id: expect.any(String), }, }, ], @@ -1257,18 +1251,6 @@ describe('writeArtifactsFromResults', () => { expect(transcriptLines[1].tool_calls[0].trace.span_id).toBeTruthy(); expect(transcriptLines[1]).not.toHaveProperty('provider_session_id'); expect(transcriptLines[1]).not.toHaveProperty('providerSessionId'); - expect(envelope.schema_version).toBe('agentv.trace.v1'); - expect(envelope.artifact_id).toMatch(/^execution-trace-/); - expect(envelope.artifacts.trace_path).toBe(CANONICAL_TRACE_ARTIFACT_PATH); - expect(envelope.artifacts.transcript_path).toBe(CANONICAL_TRANSCRIPT_ARTIFACT_PATH); - expect(envelope.artifacts.metrics_path).toBe(CANONICAL_METRICS_ARTIFACT_PATH); - expect(envelope.artifacts).not.toHaveProperty('execution_trace_path'); - expect(envelope.eval.test_id).toBe('transcript-case'); - expect(envelope.trace.spans.map((span) => span.attributes['gen_ai.operation.name'])).toEqual([ - 'invoke_agent', - 'chat', - 'execute_tool', - ]); await expect( readFile(path.join(testDir, 'transcript-case', 'transcript.json'), 'utf8'), ).rejects.toThrow(); @@ -1276,40 +1258,12 @@ describe('writeArtifactsFromResults', () => { const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); - expect(indexLine.trace_path).toBe('transcript-case/trace.json'); - expect(indexLine.transcript_path).toBe('transcript-case/transcript.jsonl'); - expect(indexLine.transcript_path.endsWith(CANONICAL_TRANSCRIPT_ARTIFACT_PATH)).toBe(true); - expect(indexLine.metrics_path).toBe('transcript-case/metrics.json'); + expect(indexLine).not.toHaveProperty('trace_path'); + expect(indexLine.transcript_path).toBe('transcript-case/run-1/transcript-raw.jsonl'); + expect(indexLine.metrics_path).toBe('transcript-case/run-1/metrics.json'); expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true); - const traceContent = await readFile(path.join(testDir, 'transcript-case', 'trace.json')); - const transcriptContent = await readFile(transcriptPath); - const traceSha = sha256Hex(traceContent); - const transcriptSha = sha256Hex(transcriptContent); - - expect(indexLine.artifact_pointers.trace).toMatchObject({ - ref: AGENTV_RESULTS_ARTIFACTS_REF, - key: 'traces/transcript-case/trace.json', - object_version: `sha256:${traceSha}`, - path: 'transcript-case/trace.json', - sha256: traceSha, - size: traceContent.byteLength, - schema_version: EXECUTION_TRACE_SCHEMA_VERSION, - media_type: TRACE_JSON_MEDIA_TYPE, - family: 'traces', - }); - expect(indexLine.artifact_pointers.transcript).toMatchObject({ - ref: AGENTV_RESULTS_ARTIFACTS_REF, - key: 'transcripts/transcript-case/transcript.jsonl', - object_version: `sha256:${transcriptSha}`, - path: 'transcript-case/transcript.jsonl', - sha256: transcriptSha, - size: transcriptContent.byteLength, - schema_version: TRANSCRIPT_SCHEMA_VERSION, - media_type: TRANSCRIPT_JSONL_MEDIA_TYPE, - family: 'transcripts', - }); - expect(indexLine.artifact_pointers).not.toHaveProperty('metrics'); + expect(indexLine.artifact_pointers).toBeUndefined(); }); it('writes AgentV metrics as Agent Skills and Vercel-style behavior projections', async () => { @@ -1406,16 +1360,18 @@ describe('writeArtifactsFromResults', () => { const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); - expect(indexLine.metrics_path).toBe('summary-case/metrics.json'); + expect(indexLine.metrics_path).toBe('summary-case/run-1/metrics.json'); const summary = MetricsArtifactWireSchema.parse( - JSON.parse(await readFile(path.join(testDir, 'summary-case', 'metrics.json'), 'utf8')), + JSON.parse( + await readFile(path.join(testDir, 'summary-case', 'run-1', 'metrics.json'), 'utf8'), + ), ); expect(summary.schema_version).toBe(METRICS_SCHEMA_VERSION); expect(summary.source_artifacts).toMatchObject({ - trace_path: CANONICAL_TRACE_ARTIFACT_PATH, - transcript_path: CANONICAL_TRANSCRIPT_ARTIFACT_PATH, + trace_path: 'transcript.json', + transcript_path: 'transcript-raw.jsonl', grading_path: 'grading.json', timing_path: 'timing.json', }); @@ -1490,7 +1446,7 @@ describe('writeArtifactsFromResults', () => { expect(summary).not.toHaveProperty('usage_summary'); const timing = JSON.parse( - await readFile(path.join(testDir, 'summary-case', 'timing.json'), 'utf8'), + await readFile(path.join(testDir, 'summary-case', 'run-1', 'timing.json'), 'utf8'), ); expect(timing).toMatchObject({ total_tokens: 140, @@ -1545,18 +1501,22 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); const aggregateTiming = JSON.parse( - await readFile(path.join(testDir, 'aggregate-usage', 'timing.json'), 'utf8'), + await readFile(path.join(testDir, 'aggregate-usage', 'run-1', 'timing.json'), 'utf8'), ); const estimatedTiming = JSON.parse( - await readFile(path.join(testDir, 'estimated-usage', 'timing.json'), 'utf8'), + await readFile(path.join(testDir, 'estimated-usage', 'run-1', 'timing.json'), 'utf8'), ); - const runTiming = JSON.parse(await readFile(path.join(testDir, 'timing.json'), 'utf8')); + const runSummary = JSON.parse(await readFile(path.join(testDir, 'summary.json'), 'utf8')); MetricsArtifactWireSchema.parse( - JSON.parse(await readFile(path.join(testDir, 'aggregate-usage', 'metrics.json'), 'utf8')), + JSON.parse( + await readFile(path.join(testDir, 'aggregate-usage', 'run-1', 'metrics.json'), 'utf8'), + ), ); MetricsArtifactWireSchema.parse( - JSON.parse(await readFile(path.join(testDir, 'estimated-usage', 'metrics.json'), 'utf8')), + JSON.parse( + await readFile(path.join(testDir, 'estimated-usage', 'run-1', 'metrics.json'), 'utf8'), + ), ); expect(aggregateTiming).toMatchObject({ @@ -1581,7 +1541,7 @@ describe('writeArtifactsFromResults', () => { duration: 'unavailable', }, }); - expect(runTiming).toMatchObject({ + expect(runSummary.timing).toMatchObject({ total_tokens: 20, cost_usd: 0.002, usage_sources: { @@ -1614,10 +1574,10 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); - const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'provider.log'); + const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'run-1', 'provider.log'); expect(await readFile(copiedRawLogPath, 'utf8')).toBe(rawLog); - const transcriptPath = path.join(testDir, 'raw-log-case', 'transcript.jsonl'); + const transcriptPath = path.join(testDir, 'raw-log-case', 'run-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).resolves.toContain( '"schema_version":"agentv.transcript.v1"', ); @@ -1625,17 +1585,16 @@ describe('writeArtifactsFromResults', () => { readFile(path.join(testDir, 'raw-log-case', 'transcript.json'), 'utf8'), ).rejects.toThrow(); - const envelope = TraceEnvelopeWireSchema.parse( - JSON.parse(await readFile(path.join(testDir, 'raw-log-case', 'trace.json'), 'utf8')), + const transcriptMessages = JSON.parse( + await readFile(path.join(testDir, 'raw-log-case', 'run-1', 'transcript.json'), 'utf8'), ); - expect(envelope.artifacts.raw_provider_log_path).toBe('provider.log'); - expect(envelope.artifacts.transcript_path).toBe('transcript.jsonl'); + expect(Array.isArray(transcriptMessages)).toBe(true); const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); - expect(indexLine.raw_provider_log_path).toBe('raw-log-case/provider.log'); - expect(indexLine.transcript_path).toBe('raw-log-case/transcript.jsonl'); + expect(indexLine.raw_provider_log_path).toBeUndefined(); + expect(indexLine.transcript_path).toBe('raw-log-case/run-1/transcript-raw.jsonl'); expect(indexLine).not.toHaveProperty('transcript_json_path'); }); @@ -1681,13 +1640,12 @@ describe('writeArtifactsFromResults', () => { expect(JSON.stringify(indexLine)).not.toContain('secret'); expect(JSON.stringify(indexLine)).not.toContain('api_key'); - const envelope = TraceEnvelopeWireSchema.parse( - JSON.parse(await readFile(path.join(testDir, 'external-trace-case', 'trace.json'), 'utf8')), + const transcriptJson = await readFile( + path.join(testDir, 'external-trace-case', 'run-1', 'transcript.json'), + 'utf8', ); - expect(envelope.external_trace).toEqual(indexLine.external_trace); - expect(envelope.source.metadata ?? {}).not.toHaveProperty('external_trace'); - expect(JSON.stringify(envelope)).not.toContain('secret'); - expect(JSON.stringify(envelope)).not.toContain('api_key'); + expect(transcriptJson).not.toContain('secret'); + expect(transcriptJson).not.toContain('api_key'); }); it('omits per-test transcript links when the execution trace has no transcript rows', async () => { @@ -1701,30 +1659,20 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); - const transcriptPath = path.join(testDir, 'no-transcript-case', 'transcript.jsonl'); + const transcriptPath = path.join( + testDir, + 'no-transcript-case', + 'run-1', + 'transcript-raw.jsonl', + ); await expect(readFile(transcriptPath, 'utf8')).rejects.toThrow(); const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); expect(indexLine).not.toHaveProperty('transcript_path'); - expect(indexLine.metrics_path).toBe('no-transcript-case/metrics.json'); - expect(indexLine.artifact_pointers.trace).toMatchObject({ - ref: AGENTV_RESULTS_ARTIFACTS_REF, - key: 'traces/no-transcript-case/trace.json', - path: 'no-transcript-case/trace.json', - schema_version: EXECUTION_TRACE_SCHEMA_VERSION, - media_type: TRACE_JSON_MEDIA_TYPE, - family: 'traces', - }); - expect(indexLine.artifact_pointers).not.toHaveProperty('transcript'); - expect(indexLine.artifact_pointers).not.toHaveProperty('metrics'); - - const envelope = TraceEnvelopeWireSchema.parse( - JSON.parse(await readFile(path.join(testDir, 'no-transcript-case', 'trace.json'), 'utf8')), - ); - expect(envelope.artifacts).not.toHaveProperty('transcript_path'); - expect(envelope.artifacts.metrics_path).toBe(CANONICAL_METRICS_ARTIFACT_PATH); + expect(indexLine.metrics_path).toBe('no-transcript-case/run-1/metrics.json'); + expect(indexLine.artifact_pointers).toBeUndefined(); }); it('sanitizes test IDs for directory names', async () => { @@ -1749,10 +1697,10 @@ describe('writeArtifactsFromResults', () => { const paths = await writeArtifactsFromResults(results, testDir); const indexLines = (await readFile(paths.indexPath, 'utf8')).trim().split('\n').map(JSON.parse); - expect(indexLines[0].grading_path).toBe('shared-id/grading.json'); + expect(indexLines[0].grading_path).toBe('shared-id/run-1/grading.json'); const grading: GradingArtifact = JSON.parse( - await readFile(path.join(testDir, 'shared-id', 'grading.json'), 'utf8'), + await readFile(path.join(testDir, 'shared-id', 'run-1', 'grading.json'), 'utf8'), ); expect(grading.assertions[0].text).toBe('baseline-check'); @@ -1768,7 +1716,7 @@ describe('writeArtifactsFromResults', () => { .trim() .split('\n') .map(JSON.parse); - expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/grading.json'); + expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json'); }); it('writes task bundle artifacts with local source paths when source metadata is provided', async () => { @@ -2078,8 +2026,8 @@ describe('writeArtifacts (from JSONL file)', () => { expect(artifactEntries).toContain('from-file'); expect(artifactEntries).toContain('index.jsonl'); - const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8')); - expect(timing.duration_ms).toBe(12000); - expect(timing.total_tokens).toBe(700); + const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); + expect(summary.timing.duration_ms).toBe(12000); + expect(summary.timing.total_tokens).toBe(700); }); }); diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index 8514f5a96..35ebec80e 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -85,12 +85,12 @@ describe('pipeline bench', () => { expect(lines[0].test_id).toBe('test-01'); expect(lines[0].score).toBeGreaterThan(0); - const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); + const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'summary.json'), 'utf8')); expect(benchmark.metadata.targets).toContain('test-target'); expect(benchmark.run_summary['test-target']).toBeDefined(); }, 30_000); - it('propagates experiment from manifest to index.jsonl and benchmark.json', async () => { + it('propagates experiment from manifest to index.jsonl and summary.json', async () => { // Overwrite manifest with experiment field await writeFile( join(OUT_DIR, 'manifest.json'), @@ -110,7 +110,7 @@ describe('pipeline bench', () => { const entry = JSON.parse(indexContent.trim().split('\n')[0]); expect(entry.experiment).toBe('without_skills'); - const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); + const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'summary.json'), 'utf8')); expect(benchmark.metadata.experiment).toBe('without_skills'); }, 30_000); @@ -122,7 +122,7 @@ describe('pipeline bench', () => { const entry = JSON.parse(indexContent.trim().split('\n')[0]); expect(entry.experiment).toBeUndefined(); - const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); + const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'summary.json'), 'utf8')); expect(benchmark.metadata.experiment).toBeUndefined(); }, 30_000); }); diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts index a2e695859..fef9a62cb 100644 --- a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts +++ b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts @@ -69,7 +69,7 @@ describe('eval pipeline e2e', () => { expect(indexLines).toHaveLength(1); expect(indexLines[0].test_id).toBe('test-01'); - const benchmark = JSON.parse(await readFile(join(outDir, 'benchmark.json'), 'utf8')); + const benchmark = JSON.parse(await readFile(join(outDir, 'summary.json'), 'utf8')); expect(benchmark.run_summary).toBeDefined(); }, PIPELINE_E2E_TIMEOUT_MS, diff --git a/apps/cli/test/commands/results/combine.test.ts b/apps/cli/test/commands/results/combine.test.ts index 618a628c9..bed14923f 100644 --- a/apps/cli/test/commands/results/combine.test.ts +++ b/apps/cli/test/commands/results/combine.test.ts @@ -90,7 +90,7 @@ describe('results combine', () => { expect( existsSync(path.join(combined.runDir, 'sources/source-1/demo/test-a/grading.json')), ).toBe(true); - const benchmark = JSON.parse(readFileSync(combined.benchmarkPath, 'utf8')) as { + const benchmark = JSON.parse(readFileSync(combined.summaryPath, 'utf8')) as { metadata: { timestamp: string }; }; expect(benchmark.metadata.timestamp).toBe('2026-06-01T10:00:00.000Z'); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index cb1b2eee6..46f1a2c58 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -11,8 +11,8 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import type { - BenchmarkArtifact, GradingArtifact, + RunSummaryArtifact, TimingArtifact, } from '../../../src/commands/eval/artifact-writer.js'; import { exportResults } from '../../../src/commands/results/export.js'; @@ -215,6 +215,10 @@ function artifactDir(outputDir: string, record: { suite?: string; test_id?: stri return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId); } +function runArtifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { + return path.join(artifactDir(outputDir, record), 'run-1'); +} + describe('export e2e — multi-provider metrics verification', () => { let tempDir: string; @@ -236,7 +240,10 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const timing: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'), + readFileSync( + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + 'utf8', + ), ); expect(timing.token_usage.input).toBe(2000); @@ -251,13 +258,16 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const claudeTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'), + readFileSync( + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + 'utf8', + ), ); const codexTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'), ); const copilotTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'), ); expect(claudeTiming.token_usage.reasoning).toBe(1500); @@ -272,7 +282,10 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const timing: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'), + readFileSync( + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + 'utf8', + ), ); expect(timing.total_tokens).toBe(2800); @@ -285,7 +298,7 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const timing: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'), ); expect(timing.duration_ms).toBe(12000); @@ -299,7 +312,7 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const timing: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, MINIMAL_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'timing.json'), 'utf8'), ); expect(timing.total_tokens).toBe(0); @@ -316,10 +329,13 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const claudeTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), 'utf8'), + readFileSync( + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + 'utf8', + ), ); const copilotTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'), ); expect(claudeTiming.token_usage.reasoning).toBe(1500); @@ -329,7 +345,7 @@ describe('export e2e — multi-provider metrics verification', () => { // ── Benchmark artifact tests ─────────────────────────────────────────── - describe('benchmark.json — per-target summary', () => { + describe('summary.json — per-target summary', () => { it('should group results by target with correct pass rates', async () => { const outputDir = path.join(tempDir, 'benchmark'); const content = toJsonl( @@ -343,8 +359,8 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); // All 6 targets should be represented @@ -362,8 +378,8 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); // claude: 8500ms = 8.5s @@ -378,8 +394,8 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); // claude: 2000 + 800 = 2800 @@ -394,8 +410,8 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); expect(benchmark.run_summary['claude-cli'].cost_usd).toBeDefined(); @@ -409,8 +425,8 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); // Claude has 3 tool calls in trace steps @@ -424,8 +440,8 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); expect(benchmark.notes.length).toBeGreaterThan(0); @@ -438,8 +454,8 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); expect(benchmark.per_grader_summary).toBeDefined(); @@ -456,7 +472,10 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const grading: GradingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json'), 'utf8'), + readFileSync( + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json'), + 'utf8', + ), ); expect(grading.assertions).toHaveLength(2); @@ -467,7 +486,10 @@ describe('export e2e — multi-provider metrics verification', () => { expect(grading.summary.pass_rate).toBe(1.0); const metrics = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'), 'utf8'), + readFileSync( + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'), + 'utf8', + ), ); expect(metrics.metrics.total_tool_calls).toBe(3); expect(metrics.metrics.tool_call_counts.Read).toBe(2); @@ -485,7 +507,7 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const grading: GradingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'grading.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'grading.json'), 'utf8'), ); expect(grading.summary.passed).toBe(1); @@ -493,7 +515,7 @@ describe('export e2e — multi-provider metrics verification', () => { expect(grading.summary.pass_rate).toBe(0.5); const metrics = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'), ); expect(metrics.metrics.total_tool_calls).toBe(0); }); @@ -505,14 +527,14 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const grading: GradingArtifact = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'grading.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'grading.json'), 'utf8'), ); // Error result has empty assertions expect(grading.summary.total).toBe(0); expect(grading.summary.pass_rate).toBe(0); const metrics = JSON.parse( - readFileSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'metrics.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'metrics.json'), 'utf8'), ); expect(metrics.metrics.errors_encountered).toBe(1); }); @@ -523,10 +545,10 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); - expect(existsSync(path.join(artifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json'))).toBe( - true, - ); - expect(existsSync(path.join(artifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe( + expect( + existsSync(path.join(runArtifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json')), + ).toBe(true); + expect(existsSync(path.join(runArtifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe( true, ); }); @@ -543,21 +565,21 @@ describe('export e2e — multi-provider metrics verification', () => { expect( readFileSync( - path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'answer.md'), + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'outputs', 'answer.md'), 'utf8', ), ).toBe('The answer is 42, derived through extended thinking.'); expect( readFileSync( - path.join(artifactDir(outputDir, CODEX_RESULT), 'outputs', 'answer.md'), + path.join(runArtifactDir(outputDir, CODEX_RESULT), 'outputs', 'answer.md'), 'utf8', ), ).toBe('Applied the requested edit to src/main.ts.'); expect( readFileSync( - path.join(artifactDir(outputDir, COPILOT_RESULT), 'outputs', 'answer.md'), + path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'outputs', 'answer.md'), 'utf8', ), ).toBe('function add(a, b) { return a + b }'); @@ -570,7 +592,7 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); expect( - existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'outputs', 'answer.md')), + existsSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'outputs', 'answer.md')), ).toBe(false); }); }); @@ -594,12 +616,12 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('eval_2026-03-18.jsonl', content, outputDir); // Verify all artifact files exist - expect(existsSync(path.join(outputDir, 'benchmark.json'))).toBe(true); - expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(true); + expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true); + expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false); // Verify benchmark - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); // 7 unique targets (claude-cli appears twice with error result) @@ -607,26 +629,28 @@ describe('export e2e — multi-provider metrics verification', () => { expect(benchmark.metadata.eval_file).toBe('eval_2026-03-18.jsonl'); // Verify grading files - expect(existsSync(path.join(artifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json'))).toBe( - true, - ); - expect(existsSync(path.join(artifactDir(outputDir, CODEX_RESULT), 'grading.json'))).toBe( + expect( + existsSync(path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'grading.json')), + ).toBe(true); + expect(existsSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'grading.json'))).toBe( true, ); - expect(existsSync(path.join(artifactDir(outputDir, COPILOT_RESULT), 'grading.json'))).toBe( + expect(existsSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'grading.json'))).toBe( true, ); - expect(existsSync(path.join(artifactDir(outputDir, PI_RESULT), 'grading.json'))).toBe(true); - expect(existsSync(path.join(artifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json'))).toBe( + expect(existsSync(path.join(runArtifactDir(outputDir, PI_RESULT), 'grading.json'))).toBe( true, ); - expect(existsSync(path.join(artifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe( + expect( + existsSync(path.join(runArtifactDir(outputDir, LLM_AZURE_RESULT), 'grading.json')), + ).toBe(true); + expect(existsSync(path.join(runArtifactDir(outputDir, LLM_GPT_RESULT), 'grading.json'))).toBe( true, ); - expect(existsSync(path.join(artifactDir(outputDir, MINIMAL_RESULT), 'grading.json'))).toBe( + expect(existsSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'grading.json'))).toBe( true, ); - expect(existsSync(path.join(artifactDir(outputDir, ERROR_RESULT), 'grading.json'))).toBe( + expect(existsSync(path.join(runArtifactDir(outputDir, ERROR_RESULT), 'grading.json'))).toBe( true, ); }); @@ -660,7 +684,10 @@ describe('export e2e — multi-provider metrics verification', () => { const timing: TimingArtifact = JSON.parse( readFileSync( - path.join(artifactDir(outputDir, { ...record, target: 'mock' as const }), 'timing.json'), + path.join( + runArtifactDir(outputDir, { ...record, target: 'mock' as const }), + 'timing.json', + ), 'utf8', ), ); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index 9bb6b283a..8e7f58c31 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -4,9 +4,9 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import type { - BenchmarkArtifact, GradingArtifact, IndexArtifactEntry, + RunSummaryArtifact, TimingArtifact, } from '../../../src/commands/eval/artifact-writer.js'; import { parseJsonlResults } from '../../../src/commands/eval/artifact-writer.js'; @@ -168,6 +168,10 @@ function artifactDir(outputDir: string, record: { suite?: string; test_id?: stri return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId); } +function runArtifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { + return path.join(artifactDir(outputDir, record), 'run-1'); +} + function readIndex(outputDir: string): IndexArtifactEntry[] { return readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8') .trim() @@ -177,7 +181,7 @@ function readIndex(outputDir: string): IndexArtifactEntry[] { } function readAnswer(outputDir: string, record: { suite?: string; test_id?: string }): string { - return readFileSync(path.join(artifactDir(outputDir, record), 'outputs', 'answer.md'), 'utf8'); + return readFileSync(path.join(runArtifactDir(outputDir, record), 'outputs', 'answer.md'), 'utf8'); } describe('results export', () => { @@ -272,7 +276,7 @@ describe('results export', () => { }); expect(first.entries[0].artifact_refs).toMatchObject({ status: 'planned_export', - timing_path: 'privacy/test-private/timing.json', + timing_path: 'privacy/test-private/run-1/timing.json', }); expect(first.entries[0].artifact_refs).not.toHaveProperty('input_path'); expect(first.entries[0].artifact_refs).not.toHaveProperty('output_path'); @@ -349,14 +353,20 @@ describe('results export', () => { }); expect(bundle.entries[0].artifact_refs).toMatchObject({ status: 'planned_export', - input_path: 'privacy/test-private/task/PROMPT.md', - output_path: 'privacy/test-private/outputs/answer.md', - answer_path: 'privacy/test-private/outputs/answer.md', + artifact_dir: 'privacy/test-private', + summary_path: 'privacy/test-private/summary.json', + grading_path: 'privacy/test-private/run-1/grading.json', + timing_path: 'privacy/test-private/run-1/timing.json', + metrics_path: 'privacy/test-private/run-1/metrics.json', + output_path: 'privacy/test-private/run-1/outputs/answer.md', + answer_path: 'privacy/test-private/run-1/outputs/answer.md', + transcript_path: 'privacy/test-private/run-1/transcript-raw.jsonl', trace_path: 'privacy/test-private/trace.json', }); + expect(bundle.entries[0].artifact_refs).not.toHaveProperty('input_path'); expect(bundle.entries[0].trace.envelope_ref).toBe('privacy/test-private/trace.json'); expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined(); - expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/grading.json'); + expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/run-1/grading.json'); expect(bundle.entries[0].raw_content).toBeDefined(); expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence'); expect(serialized).toContain('SECRET_PROMPT_TEXT'); @@ -366,16 +376,16 @@ describe('results export', () => { expect(serialized).toContain('SECRET_SCORE_EVIDENCE'); }); - it('should create benchmark.json matching artifact-writer schema', async () => { + it('should create summary.json matching artifact-writer schema', async () => { const outputDir = path.join(tempDir, 'output'); const content = toJsonl(RESULT_FULL, RESULT_PARTIAL); await exportResults('eval_2026-03-18.jsonl', content, outputDir); - const benchmarkPath = path.join(outputDir, 'benchmark.json'); - expect(existsSync(benchmarkPath)).toBe(true); + const summaryPath = path.join(outputDir, 'summary.json'); + expect(existsSync(summaryPath)).toBe(true); - const benchmark: BenchmarkArtifact = JSON.parse(readFileSync(benchmarkPath, 'utf8')); + const benchmark: RunSummaryArtifact = JSON.parse(readFileSync(summaryPath, 'utf8')); expect(benchmark.metadata.eval_file).toBe('eval_2026-03-18.jsonl'); expect(benchmark.metadata.timestamp).toBe('2026-03-18T10:00:01.000Z'); // artifact-writer uses string[] for tests_run, not a count @@ -412,13 +422,16 @@ describe('results export', () => { test_id: 'test-greeting', target: 'gpt-4o', execution_status: 'ok', - grading_path: 'demo/test-greeting/grading.json', - timing_path: 'demo/test-greeting/timing.json', - output_path: 'demo/test-greeting/outputs/answer.md', - answer_path: 'demo/test-greeting/outputs/answer.md', - transcript_path: 'demo/test-greeting/transcript.jsonl', - input_path: 'demo/test-greeting/task/PROMPT.md', + artifact_dir: 'demo/test-greeting', + summary_path: 'demo/test-greeting/summary.json', + grading_path: 'demo/test-greeting/run-1/grading.json', + timing_path: 'demo/test-greeting/run-1/timing.json', + metrics_path: 'demo/test-greeting/run-1/metrics.json', + output_path: 'demo/test-greeting/run-1/outputs/answer.md', + answer_path: 'demo/test-greeting/run-1/outputs/answer.md', + transcript_path: 'demo/test-greeting/run-1/transcript-raw.jsonl', }); + expect(entries[0]).not.toHaveProperty('input_path'); expect(entries[0].projection_identity).toMatchObject({ schema_version: 'agentv.projection_identity.v1', dimensions: { @@ -551,7 +564,7 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); - const timingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'timing.json'); + const timingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json'); expect(existsSync(timingPath)).toBe(true); const timing: TimingArtifact = JSON.parse(readFileSync(timingPath, 'utf8')); @@ -568,7 +581,7 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); - const gradingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'grading.json'); + const gradingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'grading.json'); expect(existsSync(gradingPath)).toBe(true); const grading: GradingArtifact = JSON.parse(readFileSync(gradingPath, 'utf8')); @@ -596,7 +609,7 @@ describe('results export', () => { expect(grading.graders?.[0].name).toBe('greeting_quality'); expect(grading.graders?.[0].type).toBe('llm-grader'); - const perTestTimingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'timing.json'); + const perTestTimingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json'); expect(existsSync(perTestTimingPath)).toBe(true); }); @@ -606,22 +619,26 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); - const answerPath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'answer.md'); + const answerPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'outputs', 'answer.md'); expect(existsSync(answerPath)).toBe(true); expect(readFileSync(answerPath, 'utf8')).toBe('Hello, Alice!'); - const responsePath = path.join(artifactDir(outputDir, RESULT_FULL), 'outputs', 'response.md'); + const responsePath = path.join( + runArtifactDir(outputDir, RESULT_FULL), + 'outputs', + 'response.md', + ); expect(existsSync(responsePath)).toBe(false); }); - it('should group results by target in benchmark.json', async () => { + it('should group results by target in summary.json', async () => { const outputDir = path.join(tempDir, 'output'); const content = toJsonl(RESULT_FULL, RESULT_DIFFERENT_TARGET); await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); expect(benchmark.run_summary['gpt-4o']).toBeDefined(); @@ -644,14 +661,16 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); - expect(existsSync(path.join(outputDir, 'benchmark.json'))).toBe(true); + expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true); expect(existsSync(path.join(outputDir, 'index.jsonl'))).toBe(true); - expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(true); - expect(existsSync(path.join(artifactDir(outputDir, RESULT_FULL), 'grading.json'))).toBe(true); - expect(existsSync(path.join(artifactDir(outputDir, RESULT_PARTIAL), 'grading.json'))).toBe( + expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false); + expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_FULL), 'grading.json'))).toBe( true, ); - expect(existsSync(path.join(artifactDir(outputDir, RESULT_NO_TRACE), 'grading.json'))).toBe( + expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_PARTIAL), 'grading.json'))).toBe( + true, + ); + expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_NO_TRACE), 'grading.json'))).toBe( true, ); }); @@ -662,8 +681,8 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); expect(benchmark.per_grader_summary).toBeDefined(); @@ -677,6 +696,7 @@ describe('results export', () => { const answerPath = path.join( artifactDir(outputDir, RESULT_DIFFERENT_TARGET), + 'run-1', 'outputs', 'answer.md', ); @@ -702,7 +722,7 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); const gradingPath = path.join( - artifactDir(outputDir, { ...minimal, target: 'default' }), + runArtifactDir(outputDir, { ...minimal, target: 'default' }), 'grading.json', ); expect(existsSync(gradingPath)).toBe(true); @@ -712,7 +732,7 @@ describe('results export', () => { expect(grading.summary.total).toBe(0); }); - it('should write string input to /task/PROMPT.md', async () => { + it('should not write string input to a generated prompt sidecar', async () => { const outputDir = path.join(tempDir, 'output'); const resultWithInput = { ...RESULT_FULL, @@ -723,11 +743,11 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); const inputPath = path.join(artifactDir(outputDir, resultWithInput), 'task', 'PROMPT.md'); - expect(existsSync(inputPath)).toBe(true); - expect(readFileSync(inputPath, 'utf8')).toBe('What is the capital of France?'); + expect(existsSync(inputPath)).toBe(false); + expect(readIndex(outputDir)[0]).not.toHaveProperty('input_path'); }); - it('should write Message[] input to /task/PROMPT.md as markdown', async () => { + it('should not write Message[] input to a generated prompt sidecar', async () => { const outputDir = path.join(tempDir, 'output'); const resultWithMessages = { ...RESULT_FULL, @@ -741,8 +761,8 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); const inputPath = path.join(artifactDir(outputDir, resultWithMessages), 'task', 'PROMPT.md'); - expect(existsSync(inputPath)).toBe(true); - expect(readFileSync(inputPath, 'utf8')).toBe('@[user]:\nHello\n\n@[assistant]:\nHi there!'); + expect(existsSync(inputPath)).toBe(false); + expect(readIndex(outputDir)[0]).not.toHaveProperty('input_path'); }); it('should not create input file when input is missing', async () => { @@ -765,8 +785,8 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); - const benchmark: BenchmarkArtifact = JSON.parse( - readFileSync(path.join(outputDir, 'benchmark.json'), 'utf8'), + const benchmark: RunSummaryArtifact = JSON.parse( + readFileSync(path.join(outputDir, 'summary.json'), 'utf8'), ); expect(benchmark.metadata.targets).toEqual(['unknown']); expect(benchmark.metadata.tests_run).toEqual(['unknown']); diff --git a/apps/cli/test/commands/results/remote-auto-export.test.ts b/apps/cli/test/commands/results/remote-auto-export.test.ts index f1fead56e..cfbfadfe1 100644 --- a/apps/cli/test/commands/results/remote-auto-export.test.ts +++ b/apps/cli/test/commands/results/remote-auto-export.test.ts @@ -67,7 +67,7 @@ function writeRunArtifacts(projectDir: string): string { `${JSON.stringify({ test_id: 'alpha', score: 1 })}\n`, ); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), `${JSON.stringify({ eval_file: 'evals/example.eval.yaml', tests_run: 1 }, null, 2)}\n`, ); return runDir; @@ -121,7 +121,7 @@ function writeRunArtifactsWithPointers(projectDir: string): string { })}\n`, ); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), `${JSON.stringify({ eval_file: 'evals/example.eval.yaml', tests_run: 1 }, null, 2)}\n`, ); return runDir; @@ -227,7 +227,7 @@ describe('maybeAutoExportRunArtifacts', () => { rootDir, ); expect(resultTree).toContain('runs/default/run-002/index.jsonl'); - expect(resultTree).toContain('runs/default/run-002/benchmark.json'); + expect(resultTree).toContain('runs/default/run-002/summary.json'); expect(resultTree).not.toContain('runs/default/run-002/alpha/trace.json'); expect(resultTree).not.toContain('runs/default/run-002/alpha/transcript.jsonl'); const index = JSON.parse( diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts index e4767858a..0e69332ac 100644 --- a/apps/cli/test/commands/results/report.test.ts +++ b/apps/cli/test/commands/results/report.test.ts @@ -76,14 +76,14 @@ describe('results report', () => { expect(deriveReportPath(sourceFile)).toBe(path.join(tempDir, 'run', 'report.html')); }); - it('loads benchmark eval file metadata from a run workspace', async () => { + it('loads run summary eval file metadata from a run workspace', async () => { const runDir = path.join(tempDir, 'run'); await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' }); const loaded = await loadReportSource(runDir, tempDir); expect(loaded.results).toHaveLength(1); - expect(loaded.benchmarkEvalFile).toBe('demo'); + expect(loaded.summaryEvalFile).toBe('demo'); }); it('writes a static HTML report with grouped eval files and assertion type badges', async () => { diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 64fe8d68a..ca5f6943b 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -264,7 +264,7 @@ function writeRemoteRunArtifact( const records = Array.isArray(resultRecords) ? resultRecords : [resultRecords]; writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(...records)); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), JSON.stringify( { metadata: { @@ -303,7 +303,7 @@ function writeDirtyRemoteRunArtifact( mkdirSync(runDir, { recursive: true }); writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultRecord)); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), JSON.stringify( { metadata: { @@ -354,7 +354,7 @@ function writeLocalRunArtifact( mkdirSync(runDir, { recursive: true }); writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl({ ...resultRecord, experiment })); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), JSON.stringify( { metadata: { @@ -1597,7 +1597,7 @@ describe('serve app', () => { expect(git('git branch --show-current', cloneDir)).toBe('main'); expect( git( - 'git show-ref --verify --quiet refs/remotes/agentv-results/agentv-results && echo present || true', + 'git show-ref --verify --quiet refs/remotes/origin/agentv-results && echo present || true', cloneDir, ), ).toBe(''); @@ -2340,6 +2340,7 @@ describe('serve app', () => { '2026-03-26T12-30-00-000Z', RESULT_A, ); + git(`git remote set-url origin "${missingRemoteUrl}"`, cloneDir); const app = createApp([], tempDir, tempDir, undefined, { studioDir }); const res = await app.request('/api/projects/project-sync-offline/remote/sync', { method: 'POST', @@ -2679,7 +2680,7 @@ describe('serve app', () => { }; expect(tags.tags.sort()).toEqual(['baseline', 'candidate', 'shared']); const benchmark = JSON.parse( - readFileSync(path.join(combinedDir, 'benchmark.json'), 'utf8'), + readFileSync(path.join(combinedDir, 'summary.json'), 'utf8'), ) as { metadata: { combined_from_run_ids?: string[]; display_name?: string; timestamp?: string }; }; @@ -3460,7 +3461,7 @@ describe('serve app', () => { }), ); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), JSON.stringify( { metadata: { @@ -3496,7 +3497,7 @@ describe('serve app', () => { autoPush: false, }); - const artifactRemoteRef = `refs/remotes/agentv-results/${AGENTV_RESULTS_ARTIFACTS_REF}`; + const artifactRemoteRef = `refs/remotes/origin/${AGENTV_RESULTS_ARTIFACTS_REF}`; const artifactRefLookup = () => git( `git -C "${cloneDir}" show-ref --verify --quiet ${artifactRemoteRef} && echo present || true`, @@ -4404,11 +4405,11 @@ describe('serve app', () => { // // The Dashboard "Resume run" / "Rerun failed cases" buttons need the run dir // and the original eval file path to issue a launch request that targets - // the same run workspace. handleRunDetail reads benchmark.json's + // the same run workspace. handleRunDetail reads summary.json's // metadata.eval_file and reports the run dir relative to cwd. describe('GET /api/runs/:filename (resume metadata)', () => { - it('includes run_dir and suite_filter for local runs with benchmark.json', async () => { + it('includes run_dir and suite_filter for local runs with summary.json', async () => { const runsDir = localResultsExperimentDir(tempDir); mkdirSync(runsDir, { recursive: true }); const filename = '2026-05-06T00-00-00-000Z'; @@ -4416,7 +4417,7 @@ describe('serve app', () => { mkdirSync(runDir, { recursive: true }); writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), JSON.stringify( { metadata: { @@ -4446,7 +4447,7 @@ describe('serve app', () => { expect(data.suite_filter).toBe('examples/demo.eval.yaml'); }); - it('omits suite_filter when benchmark.json is missing', async () => { + it('omits suite_filter when summary.json is missing', async () => { const runsDir = localResultsExperimentDir(tempDir); mkdirSync(runsDir, { recursive: true }); const filename = '2026-05-06T00-00-01-000Z'; diff --git a/apps/cli/test/commands/results/validate.test.ts b/apps/cli/test/commands/results/validate.test.ts index 5ebdd0742..3836b90bc 100644 --- a/apps/cli/test/commands/results/validate.test.ts +++ b/apps/cli/test/commands/results/validate.test.ts @@ -25,7 +25,29 @@ describe('results validate', () => { test_id: 'test-greeting', score: 1, target: 'gpt-4o', + scores: [{ name: 'quality', type: 'llm', score: 1, verdict: 'pass' }], execution_status: 'ok', + summary_path: 'test-greeting/summary.json', + })}\n`, + ); + mkdirSync(path.join(runDir, 'test-greeting'), { recursive: true }); + writeFileSync( + path.join(runDir, 'test-greeting', 'summary.json'), + `${JSON.stringify({ + test_id: 'test-greeting', + score: 1, + target: 'gpt-4o', + execution_status: 'ok', + })}\n`, + ); + writeFileSync( + path.join(runDir, 'summary.json'), + `${JSON.stringify({ + schema_version: 1, + metadata: { + experiment: 'with-skills', + timestamp: '2026-03-27T12:42:24.429Z', + }, })}\n`, ); diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index 91b8934cf..dd9629bf6 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -408,7 +408,7 @@ describe('trace utils', () => { writeFileSync(path.join(runDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n`); writeFileSync( - path.join(runDir, 'benchmark.json'), + path.join(runDir, 'summary.json'), JSON.stringify({ metadata: { display_name: 'Combined run (dogfood-run-a + dogfood-run-b)', diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 083819931..7972e1ea8 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -343,10 +343,11 @@ describe('agentv eval CLI', () => { const results = await readJsonLines(path.join(outputDir, 'index.jsonl')); expect(results).toHaveLength(2); - await expectFileExists(path.join(outputDir, 'benchmark.json')); - await expectFileExists(path.join(outputDir, 'timing.json')); - await expectFileExists(path.join(outputDir, 'case-alpha', 'grading.json')); - await expectFileExists(path.join(outputDir, 'case-beta', 'grading.json')); + await expectFileExists(path.join(outputDir, 'summary.json')); + await expectFileExists(path.join(outputDir, 'case-alpha', 'summary.json')); + await expectFileExists(path.join(outputDir, 'case-alpha', 'run-1', 'grading.json')); + await expectFileExists(path.join(outputDir, 'case-beta', 'summary.json')); + await expectFileExists(path.join(outputDir, 'case-beta', 'run-1', 'grading.json')); } finally { await rm(fixture.baseDir, { recursive: true, force: true }); } @@ -363,8 +364,9 @@ describe('agentv eval CLI', () => { expect(exitCode).toBe(0); expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl')); await expectFileExists(path.join(outputDir, 'index.jsonl')); - await expectFileExists(path.join(outputDir, 'benchmark.json')); - await expectFileExists(path.join(outputDir, 'case-alpha', 'grading.json')); + await expectFileExists(path.join(outputDir, 'summary.json')); + await expectFileExists(path.join(outputDir, 'case-alpha', 'summary.json')); + await expectFileExists(path.join(outputDir, 'case-alpha', 'run-1', 'grading.json')); } finally { await rm(fixture.baseDir, { recursive: true, force: true }); } @@ -403,10 +405,9 @@ describe('agentv eval CLI', () => { const canonicalResults = await readJsonLines(path.join(outputDir, 'index.jsonl')); expect(canonicalResults).toHaveLength(2); - await expectFileExists(path.join(outputDir, 'benchmark.json')); - await expectFileExists(path.join(outputDir, 'timing.json')); + await expectFileExists(path.join(outputDir, 'summary.json')); for (const row of canonicalResults) { - expect(row.transcript_path).toMatch(/transcript\.jsonl$/); + expect(row.transcript_path).toMatch(/run-1\/transcript-raw\.jsonl$/); await expectFileExists(path.join(outputDir, row.transcript_path as string)); } } finally { @@ -592,7 +593,7 @@ describe('agentv eval CLI', () => { await expectFileExists(path.join(fixture.suiteDir, 'experiment-script.txt')); const benchmark = JSON.parse( - await readFile(path.join(path.dirname(outputPath), 'benchmark.json'), 'utf8'), + await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'), ) as { metadata?: Record }; expect(benchmark.metadata?.experiment).toBe('native-exp'); expect(benchmark.metadata?.experiment_config).toMatchObject({ @@ -782,7 +783,7 @@ describe('agentv eval CLI', () => { const helpText = `${result.stdout}\n${result.stderr}`; expect(helpText).not.toContain('--benchmark-json'); expect(helpText).toContain('--output'); - expect(helpText).toContain('benchmark.json'); + expect(helpText).toContain('summary.json'); }, 30_000); it('rejects the removed benchmark JSON export flag as an unknown argument', async () => { @@ -792,7 +793,7 @@ describe('agentv eval CLI', () => { 'eval', fixture.testFilePath, '--benchmark-json', - path.join(fixture.baseDir, 'benchmark.json'), + path.join(fixture.baseDir, 'summary.json'), ]); expect(result.exitCode).not.toBe(0); diff --git a/apps/dashboard/src/components/EvalDetail.tsx b/apps/dashboard/src/components/EvalDetail.tsx index 0ffd153af..4592197a6 100644 --- a/apps/dashboard/src/components/EvalDetail.tsx +++ b/apps/dashboard/src/components/EvalDetail.tsx @@ -6,7 +6,7 @@ * Assertions are grouped by grader name. */ -import { useMemo, useState } from 'react'; +import { useEffect, useMemo, useState } from 'react'; import { useQuery } from '@tanstack/react-query'; import { @@ -20,8 +20,10 @@ import { useEvalTranscript, useStudioConfig, } from '~/lib/api'; +import type { RepeatRunGroup } from '~/lib/result-table'; import type { AssertionEntry, + EvalCaseTrial, EvalResult, ScoreEntry, SourceCapturedFile, @@ -40,6 +42,11 @@ interface EvalDetailProps { eval: EvalResult; runId: string; projectId?: string; + repeatGroup?: RepeatRunGroup; + selectedTrial?: EvalCaseTrial | null; + initialTab?: Tab; + initialSelectedFilePath?: string | null; + onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void; } type Tab = 'checks' | 'transcript' | 'source' | 'files' | 'feedback'; @@ -56,11 +63,90 @@ function findFirstFile(nodes: FileNode[]): string | null { return null; } -export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) { - const [activeTab, setActiveTab] = useState('checks'); - const [selectedFilePath, setSelectedFilePath] = useState(null); +function caseTrialPath(trial: EvalCaseTrial, index = 0): string { + return trial.run_path ?? `run-${trial.attempt ?? index + 1}`; +} + +function caseTrialTokenTotal(trial: EvalCaseTrial): number | undefined { + if (trial.total_tokens != null) return trial.total_tokens; + const usage = trial.token_usage; + if (!usage) return undefined; + const values = [usage.input, usage.output, usage.reasoning, usage.cached].filter( + (value): value is number => typeof value === 'number' && Number.isFinite(value), + ); + return values.length > 0 ? values.reduce((sum, value) => sum + value, 0) : undefined; +} + +function formatPercent(value: number | undefined): string { + if (value == null || !Number.isFinite(value)) return '-'; + return `${Math.round(value * 100)}%`; +} + +function formatDuration(durationMs: number | undefined): string { + if (durationMs == null) return '-'; + if (durationMs < 1000) return `${Math.round(durationMs)}ms`; + if (durationMs < 60_000) return `${(durationMs / 1000).toFixed(1)}s`; + const minutes = Math.floor(durationMs / 60_000); + const seconds = Math.round((durationMs % 60_000) / 1000); + return `${minutes}m ${seconds}s`; +} + +function formatCost(costUsd: number | undefined): string | undefined { + if (costUsd == null) return undefined; + if (costUsd === 0) return '$0'; + if (costUsd < 0.01) return `$${costUsd.toFixed(5)}`; + return `$${costUsd.toFixed(4)}`; +} + +function formatTokens(tokens: number | undefined): string | undefined { + if (tokens == null) return undefined; + if (tokens >= 1_000_000) return `${(tokens / 1_000_000).toFixed(1)}M tok`; + if (tokens >= 1000) return `${(tokens / 1000).toFixed(1)}k tok`; + return `${tokens} tok`; +} + +function selectedTrialResult(result: EvalResult, trial: EvalCaseTrial): EvalResult { + return { + ...result, + score: trial.score ?? result.score, + executionStatus: trial.execution_status ?? result.executionStatus, + error: trial.error, + costUsd: trial.cost_usd ?? result.costUsd, + durationMs: trial.duration_ms ?? result.durationMs, + scores: trial.scores, + assertions: trial.assertions, + trials: undefined, + aggregation: undefined, + grading_path: trial.grading_path, + timing_path: trial.timing_path, + metrics_path: trial.metrics_path, + transcript_path: trial.transcript_path, + output_path: trial.answer_path, + answer_path: trial.answer_path, + }; +} + +export function EvalDetail({ + eval: result, + runId, + projectId, + repeatGroup, + selectedTrial = null, + initialTab = 'checks', + initialSelectedFilePath = null, + onSelectTrial, +}: EvalDetailProps) { + const [activeTab, setActiveTab] = useState(initialTab); + const [selectedFilePath, setSelectedFilePath] = useState(initialSelectedFilePath); const { data: config } = useStudioConfig(projectId); const isReadOnly = config?.read_only === true; + const detailResult = selectedTrial ? selectedTrialResult(result, selectedTrial) : result; + const showAggregateRepeat = repeatGroup != null && selectedTrial == null; + + useEffect(() => { + setActiveTab(initialTab); + setSelectedFilePath(initialSelectedFilePath); + }, [initialTab, initialSelectedFilePath]); const tabs: { id: Tab; label: string }[] = [ { id: 'checks', label: 'Checks' }, @@ -76,7 +162,7 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) }; return ( -
+
{/* Tab navigation — at the top so Files tab editor fills maximum height */}
@@ -98,16 +184,32 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
{/* Tab content */} -
+
{activeTab === 'checks' && (
- + {showAggregateRepeat ? ( + + ) : selectedTrial ? ( + + ) : ( + + )}
)} {activeTab === 'files' && ( -
+
- + {showAggregateRepeat ? ( + + ) : selectedTrial ? ( + + ) : ( + + )}
)} {activeTab === 'source' && (
- +
)} {!isReadOnly && activeTab === 'feedback' && (
- +
)}
@@ -433,6 +553,231 @@ function ChecksTab({ result, projectId }: { result: EvalResult; projectId?: stri ); } +function RunMetricRow({ label, value }: { label: string; value: string | undefined }) { + return ( +
+
{label}
+
{value ?? '-'}
+
+ ); +} + +function TrialActionRow({ + trial, + index, + onSelectTrial, +}: { + trial: EvalCaseTrial; + index: number; + onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void; +}) { + const label = caseTrialPath(trial, index); + return ( +
+
+
{label}
+
+ {formatPercent(trial.score)} score + {trial.verdict ?? 'unknown'} + {trial.duration_ms != null ? {formatDuration(trial.duration_ms)} : null} + {trial.total_tool_calls != null ? {trial.total_tool_calls} tool calls : null} +
+
+
+ + +
+
+ ); +} + +function RepeatAggregateChecksTab({ + result, + group, + onSelectTrial, +}: { + result: EvalResult; + group: RepeatRunGroup; + onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void; +}) { + return ( +
+
+
+ + + + +
+
+ + {result.scores && result.scores.length > 0 ? ( +
+

Aggregate Grader Scores

+
+ {result.scores.map((score, index) => ( +
+ + {score.name ?? score.type ?? `Score ${index + 1}`} + +
+ +
+
+ ))} +
+
+ ) : null} + +
+

Runs

+ {group.trials.map((trial, index) => ( + + ))} +
+
+ ); +} + +type ParsedGradingArtifact = { + assertions: AssertionEntry[]; + summary?: { + passed?: number; + failed?: number; + total?: number; + pass_rate?: number; + }; + error?: string; +}; + +function parseGradingArtifact(content: string | undefined): ParsedGradingArtifact | null { + if (!content) return null; + try { + const parsed = JSON.parse(content) as Record; + const rawAssertions = Array.isArray(parsed.assertions) ? parsed.assertions : []; + const assertions = rawAssertions.flatMap((value): AssertionEntry[] => { + if (!value || typeof value !== 'object') return []; + const assertion = value as Record; + if (typeof assertion.text !== 'string' || typeof assertion.passed !== 'boolean') { + return []; + } + return [ + { + text: assertion.text, + passed: assertion.passed, + evidence: typeof assertion.evidence === 'string' ? assertion.evidence : undefined, + }, + ]; + }); + const summary = + parsed.summary && typeof parsed.summary === 'object' ? parsed.summary : undefined; + return { assertions, summary: summary as ParsedGradingArtifact['summary'] }; + } catch (error) { + return { assertions: [], error: error instanceof Error ? error.message : String(error) }; + } +} + +function TrialChecksTab({ + result, + trial, + runId, + projectId, + onOpenFile, +}: { + result: EvalResult; + trial: EvalCaseTrial; + runId: string; + projectId?: string; + onOpenFile: (path: string) => void; +}) { + const gradingPath = trial.grading_path; + const artifactDir = result.artifact_dir; + const evalId = result.testId; + const { data: gradingContent, isLoading } = + projectId && gradingPath + ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, gradingPath, artifactDir)) + : useEvalFileContent(runId, evalId, gradingPath ?? '', artifactDir); + const parsed = parseGradingArtifact(gradingContent?.content); + + if (!gradingPath) { + return ; + } + + return ( +
+
+
+ Run score +
+ +
+
+
+ +
+ + + +
+ +
+
+

Grading

+ +
+ {isLoading ? ( +

Loading grading artifact...

+ ) : null} + {parsed?.error ?

{parsed.error}

: null} + {parsed?.summary ? ( +
+ + + +
+ ) : null} +
+ + {parsed && parsed.assertions.length > 0 ? ( +
+ {parsed.assertions.map((assertion, index) => ( + + ))} +
+ ) : !isLoading ? ( +

No assertion steps recorded in grading.json.

+ ) : null} +
+ ); +} + function containsFilePath(nodes: FileNode[], filePath: string | null): boolean { if (!filePath) return false; for (const node of nodes) { @@ -442,6 +787,193 @@ function containsFilePath(nodes: FileNode[], filePath: string | null): boolean { return false; } +function RepeatAggregateTranscriptTab({ + result, + group, + runId, + projectId, + onSelectTrial, +}: { + result: EvalResult; + group: RepeatRunGroup; + runId: string; + projectId?: string; + onSelectTrial?: (trial: EvalCaseTrial, initialTab?: Tab) => void; +}) { + return ( +
+

+ Run transcripts +

+ {group.trials.map((trial, index) => { + const runLabel = caseTrialPath(trial, index); + const transcriptPath = trial.transcript_path; + const transcriptHref = transcriptPath + ? artifactFileContentUrl({ + projectId, + runId, + evalId: result.testId, + filePath: transcriptPath, + artifactDir: result.artifact_dir, + raw: true, + }) + : undefined; + return ( +
+
+
{runLabel}
+
+ {transcriptPath ?? 'No transcript artifact'} +
+
+
+ + {transcriptHref ? ( + + Raw + + ) : null} +
+
+ ); + })} +
+ ); +} + +function TrialTranscriptTab({ + result, + trial, + runId, + projectId, + onOpenFile, +}: { + result: EvalResult; + trial: EvalCaseTrial; + runId: string; + projectId?: string; + onOpenFile: (path: string) => void; +}) { + const evalId = result.testId; + const artifactDir = result.artifact_dir; + const transcriptPath = trial.transcript_path; + const answerPath = trial.answer_path; + const { data: transcriptContent, isLoading: isLoadingTranscript } = + projectId && transcriptPath + ? useQuery( + projectEvalFileContentOptions(projectId, runId, evalId, transcriptPath, artifactDir), + ) + : useEvalFileContent(runId, evalId, transcriptPath ?? '', artifactDir); + const { data: answerContent } = + projectId && answerPath + ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, answerPath, artifactDir)) + : useEvalFileContent(runId, evalId, answerPath ?? '', artifactDir); + + const transcriptValue = transcriptContent?.content ?? ''; + const parsedTranscript = useMemo(() => parseTranscriptJsonl(transcriptValue), [transcriptValue]); + + if (!transcriptPath) { + return ( +
+

No structured transcript

+

+ This run does not include a transcript artifact. +

+
+ ); + } + + if (isLoadingTranscript) { + return ( +
+ Loading transcript artifact... +
+ ); + } + + if (parsedTranscript.error) { + return ( +
+

Transcript could not be parsed

+

{parsedTranscript.error}

+ +
+ ); + } + + if (parsedTranscript.entries.length === 0) { + return ( +
+

Empty transcript

+

+ {transcriptPath} exists but contains no JSONL rows. +

+
+ ); + } + + const answerHref = answerPath + ? artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: answerPath, + artifactDir, + raw: true, + }) + : undefined; + const transcriptHref = artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: transcriptPath, + artifactDir, + raw: true, + }); + const transcriptDownloadHref = artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: transcriptPath, + artifactDir, + download: true, + }); + + return ( + + ); +} + function TranscriptTab({ result, runId, @@ -454,13 +986,14 @@ function TranscriptTab({ onOpenFile: (path: string) => void; }) { const evalId = result.testId; + const artifactDir = result.artifact_dir; const { data: transcriptData, isLoading: isLoadingTranscript, error: transcriptError, } = projectId - ? useQuery(projectEvalTranscriptOptions(projectId, runId, evalId)) - : useEvalTranscript(runId, evalId); + ? useQuery(projectEvalTranscriptOptions(projectId, runId, evalId, artifactDir)) + : useEvalTranscript(runId, evalId, artifactDir); const transcriptPath = transcriptData?.transcript_path; const answerPath = transcriptData?.answer_path; const transcriptContent = transcriptData?.status === 'ok' ? (transcriptData.content ?? '') : ''; @@ -541,6 +1074,7 @@ function TranscriptTab({ runId, evalId, filePath: transcriptPath, + artifactDir, raw: true, })} target="_blank" @@ -568,7 +1102,14 @@ function TranscriptTab({ } const answerHref = answerPath - ? artifactFileContentUrl({ projectId, runId, evalId, filePath: answerPath, raw: true }) + ? artifactFileContentUrl({ + projectId, + runId, + evalId, + filePath: answerPath, + artifactDir, + raw: true, + }) : undefined; const transcriptHref = transcriptPath ? artifactFileContentUrl({ @@ -576,6 +1117,7 @@ function TranscriptTab({ runId, evalId, filePath: transcriptPath, + artifactDir, raw: true, }) : undefined; @@ -585,6 +1127,7 @@ function TranscriptTab({ runId, evalId, filePath: transcriptPath, + artifactDir, download: true, }) : undefined; @@ -617,11 +1160,12 @@ function FilesTab({ onSelectedPathChange: (path: string) => void; }) { const evalId = result.testId; + const artifactDir = result.artifact_dir; // Use project-scoped API hooks when projectId is present const { data: filesData } = projectId - ? useQuery(projectEvalFilesOptions(projectId, runId, evalId)) - : useEvalFiles(runId, evalId); + ? useQuery(projectEvalFilesOptions(projectId, runId, evalId, artifactDir)) + : useEvalFiles(runId, evalId, artifactDir); const files = filesData?.files ?? []; const [localSelectedPath, setLocalSelectedPath] = useState(null); @@ -635,8 +1179,10 @@ function FilesTab({ : null; const { data: fileContentData, isLoading: isLoadingContent } = projectId - ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, effectivePath ?? '')) - : useEvalFileContent(runId, evalId, effectivePath ?? ''); + ? useQuery( + projectEvalFileContentOptions(projectId, runId, evalId, effectivePath ?? '', artifactDir), + ) + : useEvalFileContent(runId, evalId, effectivePath ?? '', artifactDir); if (files.length === 0) { return

No artifact files available.

; @@ -651,9 +1197,11 @@ function FilesTab({ const displayLanguage = effectivePath ? (fileContentData?.language ?? 'plaintext') : 'plaintext'; return ( -
+
{/* FileTree panel — desktop: side-by-side, mobile: full-width slide-over */} -
+
{/* MonacoViewer panel — desktop: side-by-side, mobile: full-width */} -
+
diff --git a/apps/dashboard/src/components/ResultTable.tsx b/apps/dashboard/src/components/ResultTable.tsx index 4660f3aea..37b577ad8 100644 --- a/apps/dashboard/src/components/ResultTable.tsx +++ b/apps/dashboard/src/components/ResultTable.tsx @@ -7,24 +7,25 @@ */ import type React from 'react'; -import { useEffect, useMemo, useState } from 'react'; - -import { Link } from '@tanstack/react-router'; +import { Fragment, useEffect, useMemo, useState } from 'react'; import { useFeedback } from '~/lib/api'; import { RESULT_TABLE_VIEW_PRESETS, + type RepeatRunGroup, type ResultTableColumn, type ResultTableRow, type ResultTableState, type ResultTableStateInput, buildResultTableModel, } from '~/lib/result-table'; -import type { EvalResult, ScoreEntry } from '~/lib/types'; +import type { EvalCaseTrial, EvalResult, ScoreEntry } from '~/lib/types'; import { EvalDetail } from './EvalDetail'; import { PassRatePill } from './PassRatePill'; +type DetailTab = 'checks' | 'transcript' | 'source' | 'files' | 'feedback'; + interface ResultTableProps { results: readonly EvalResult[]; runId: string; @@ -44,6 +45,9 @@ const QUERY_KEYS = { detail: 'results_detail', } as const; +const CHECK_MARK = '\u2713'; +const CROSS_MARK = '\u2717'; + function readUrlState(): ResultTableStateInput { if (typeof window === 'undefined') return {}; const params = new URLSearchParams(window.location.search); @@ -132,6 +136,24 @@ function formatTokens(tokens: number | undefined): string | undefined { return `${tokens} tok`; } +function tokenUsageTotal( + usage: EvalCaseTrial['token_usage'] | EvalResult['tokenUsage'], +): number | undefined { + if (!usage) return undefined; + const values = [usage.input, usage.output, usage.reasoning, usage.cached].filter( + (value): value is number => typeof value === 'number' && Number.isFinite(value), + ); + return values.length > 0 ? values.reduce((sum, value) => sum + value, 0) : undefined; +} + +function caseTrialTokenTotal(trial: EvalCaseTrial): number | undefined { + return trial.total_tokens ?? tokenUsageTotal(trial.token_usage); +} + +function caseTrialPath(trial: EvalCaseTrial, index = 0): string { + return trial.run_path ?? `run-${trial.attempt ?? index + 1}`; +} + function compactTokenBreakdown(result: EvalResult): string | undefined { const usage = result.tokenUsage; if (!usage) return undefined; @@ -166,6 +188,12 @@ export function ResultTable({ }: ResultTableProps) { const [urlState, setUrlState] = useState(() => readUrlState()); const [selectedRowKey, setSelectedRowKey] = useState(() => readSelectedRowKey()); + const [selectedTrialPath, setSelectedTrialPath] = useState(null); + const [selectedDetailFilePath, setSelectedDetailFilePath] = useState(null); + const [selectedDetailTab, setSelectedDetailTab] = useState('checks'); + const [collapsedRepeatRows, setCollapsedRepeatRows] = useState>( + () => new Set(), + ); const { data: feedback } = useFeedback(projectId); const reviewedTestIds = useMemo( () => feedback?.reviews.map((review) => review.test_id) ?? [], @@ -186,11 +214,25 @@ export function ResultTable({ selectedRowKey != null ? (model.filteredRows.find((row) => row.key === selectedRowKey) ?? null) : null; + const repeatGroupsByRowKey = useMemo( + () => new Map(model.repeatGroups.map((group) => [group.row.key, group])), + [model.repeatGroups], + ); + const selectedRepeatGroup = selectedRow ? repeatGroupsByRowKey.get(selectedRow.key) : undefined; + const selectedTrial = + selectedRepeatGroup && selectedTrialPath + ? (selectedRepeatGroup.trials.find( + (trial, index) => caseTrialPath(trial, index) === selectedTrialPath, + ) ?? null) + : null; useEffect(() => { const handlePopState = () => { setUrlState(readUrlState()); setSelectedRowKey(readSelectedRowKey()); + setSelectedTrialPath(null); + setSelectedDetailFilePath(null); + setSelectedDetailTab('checks'); }; window.addEventListener('popstate', handlePopState); return () => window.removeEventListener('popstate', handlePopState); @@ -219,6 +261,9 @@ export function ResultTable({ window.history.replaceState(window.history.state, '', nextUrl); setUrlState({}); setSelectedRowKey(null); + setSelectedTrialPath(null); + setSelectedDetailFilePath(null); + setSelectedDetailTab('checks'); } function toggleColumn(columnId: string) { @@ -231,11 +276,34 @@ export function ResultTable({ function openRowDetail(rowKey: string) { writeSelectedRowKey(rowKey); setSelectedRowKey(rowKey); + setSelectedTrialPath(null); + setSelectedDetailFilePath(null); + setSelectedDetailTab('checks'); + } + + function openTrialDetail(rowKey: string, trial: EvalCaseTrial, initialTab: DetailTab = 'checks') { + writeSelectedRowKey(rowKey); + setSelectedRowKey(rowKey); + setSelectedTrialPath(caseTrialPath(trial)); + setSelectedDetailTab(initialTab); + setSelectedDetailFilePath(primaryTrialArtifactPath(trial)); } function closeRowDetail() { writeSelectedRowKey(null); setSelectedRowKey(null); + setSelectedTrialPath(null); + setSelectedDetailFilePath(null); + setSelectedDetailTab('checks'); + } + + function toggleRepeatGroup(rowKey: string) { + setCollapsedRepeatRows((current) => { + const next = new Set(current); + if (next.has(rowKey)) next.delete(rowKey); + else next.add(rowKey); + return next; + }); } if (results.length === 0) { @@ -370,58 +438,18 @@ export function ResultTable({

) : ( -
- - - - {model.visibleColumns.map((column) => ( - - ))} - - - - {model.filteredRows.map((row) => { - const isSelected = selectedRowKey === row.key; - return ( - - {model.visibleColumns.map((column) => ( - - ))} - - ); - })} - -
- {column.label} -
- -
-
+ )}
@@ -430,6 +458,14 @@ export function ResultTable({ row={selectedRow} runId={runId} projectId={projectId} + repeatGroup={selectedRepeatGroup} + selectedTrial={selectedTrial} + selectedTrialPath={selectedTrialPath} + initialTab={selectedDetailTab} + initialFilePath={selectedDetailFilePath} + onOpenTrialDetail={(trial, initialTab) => + openTrialDetail(selectedRow.key, trial, initialTab) + } onClose={closeRowDetail} /> )} @@ -438,22 +474,326 @@ export function ResultTable({ ); } +function ResultRowsTable({ + rows, + visibleColumns, + passThreshold, + selectedRowKey, + selectedTrialPath, + repeatGroupsByRowKey, + collapsedRepeatRows, + onToggleRepeatGroup, + onOpenDetail, + onOpenTrialDetail, +}: { + rows: readonly ResultTableRow[]; + visibleColumns: readonly ResultTableColumn[]; + passThreshold: number; + selectedRowKey: string | null; + selectedTrialPath: string | null; + repeatGroupsByRowKey: ReadonlyMap; + collapsedRepeatRows: ReadonlySet; + onToggleRepeatGroup: (rowKey: string) => void; + onOpenDetail: (rowKey: string) => void; + onOpenTrialDetail: (rowKey: string, trial: EvalCaseTrial) => void; +}) { + return ( +
+ + + + {visibleColumns.map((column) => ( + + ))} + + + + {rows.map((row) => { + const repeatGroup = repeatGroupsByRowKey.get(row.key); + const isSelected = selectedRowKey === row.key && !selectedTrialPath; + const collapsed = repeatGroup ? collapsedRepeatRows.has(row.key) : true; + return ( + + onOpenDetail(row.key)} + onKeyDown={(event) => { + if (event.key === 'Enter' || event.key === ' ') { + event.preventDefault(); + onOpenDetail(row.key); + } + }} + tabIndex={0} + aria-selected={isSelected} + > + {visibleColumns.map((column) => ( + + ))} + + {repeatGroup && !collapsed + ? repeatGroup.trials.map((trial, index) => { + const trialPath = caseTrialPath(trial, index); + const trialSelected = + selectedRowKey === row.key && selectedTrialPath === trialPath; + return ( + onOpenTrialDetail(row.key, trial)} + onKeyDown={(event) => { + if (event.key === 'Enter' || event.key === ' ') { + event.preventDefault(); + onOpenTrialDetail(row.key, trial); + } + }} + tabIndex={0} + aria-selected={trialSelected} + > + {visibleColumns.map((column) => ( + + ))} + + ); + }) + : null} + + ); + })} + +
+ + {column.label} + +
+ +
+ +
+
+ ); +} + +function caseTrialPassed(trial: EvalCaseTrial, passThreshold: number): boolean { + if (trial.verdict === 'pass') return true; + if (trial.verdict === 'fail') return false; + return typeof trial.score === 'number' ? trial.score >= passThreshold : false; +} + +function primaryTrialArtifactPath(trial: EvalCaseTrial): string | null { + return ( + trial.grading_path ?? + trial.metrics_path ?? + trial.timing_path ?? + trial.transcript_path ?? + trial.answer_path ?? + null + ); +} + +function TrialResultCell({ + column, + row, + trial, + index, + passThreshold, +}: { + column: ResultTableColumn; + row: ResultTableRow; + trial: EvalCaseTrial; + index: number; + passThreshold: number; +}) { + const passed = caseTrialPassed(trial, passThreshold); + const isExecutionError = trial.execution_status === 'execution_error'; + const status = isExecutionError ? 'error' : passed ? 'passing' : 'failing'; + const statusLabel = isExecutionError ? 'Error' : passed ? 'Passing' : 'Failing'; + const label = caseTrialPath(trial, index); + + switch (column.id) { + case 'status': + return ; + case 'expander': + return