diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 5e2eb1058..6eeaff397 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1,11 +1,11 @@ import { spawn } from 'node:child_process'; import { constants, existsSync, mkdirSync } from 'node:fs'; import { access, readFile } from 'node:fs/promises'; +import { createRequire } from 'node:module'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; import { - DEFAULT_EVAL_PATTERNS, DEFAULT_THRESHOLD, type EvalTargetRef, type EvalTest, @@ -89,6 +89,10 @@ import type { TaskBundleTargetSelection } from './task-bundle.js'; import { WipCheckpointLoop } from './wip-checkpoint.js'; const DEFAULT_WORKERS = 3; +const require = createRequire(import.meta.url); +const micromatch = require('micromatch') as { + isMatch(id: string, pattern: string): boolean; +}; function shouldSkipExistingResultForResume( result: Pick, @@ -156,6 +160,7 @@ interface NormalizedOptions { readonly experimentMetadata?: ExperimentArtifactMetadata; readonly experimentTargetRefs?: readonly EvalTargetRef[]; readonly experimentTrialsConfig?: TrialsConfig; + readonly suiteFiltersByEvalFile?: ReadonlyMap; readonly budgetUsd?: number; readonly sourceMetadataByEvalFile?: ReadonlyMap>; readonly resultsOverrides?: ResultsPublishOverrides; @@ -650,8 +655,6 @@ function applyExperimentOptions( const workspaceMode = options.workspaceMode ?? readExperimentWorkspaceMode(experiment.workspace?.mode); const workspacePath = options.workspacePath ?? readExperimentWorkspacePath(experiment.workspace); - const experimentFilter = normalizeExperimentCaseFilter(experiment.evals); - return { ...options, target: options.target ?? (nextCliTargets.length === 1 ? nextCliTargets[0] : undefined), @@ -660,7 +663,6 @@ function applyExperimentOptions( workers: options.workers ?? experiment.workers, workspaceMode: workspacePath ? 'static' : workspaceMode, workspacePath, - filter: options.filter ?? experimentFilter, budgetUsd: options.budgetUsd ?? experiment.budgetUsd, experimentConfig: experiment, experimentMetadata: buildExperimentArtifactMetadata(experiment), @@ -724,16 +726,53 @@ function readExperimentWorkspacePath( return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined; } -function normalizeExperimentCaseFilter( - evals: ExperimentConfig['evals'] | undefined, -): NormalizedOptions['filter'] { - if (typeof evals === 'string') { - return evals; - } - if (!Array.isArray(evals)) { +type ExperimentSuiteSelection = { + readonly testFiles: readonly string[]; + readonly filtersByEvalFile: ReadonlyMap; +}; + +function matchesTestFilter(id: string, filter: string | readonly string[]): boolean { + return typeof filter === 'string' + ? micromatch.isMatch(id, filter) + : filter.some((pattern) => micromatch.isMatch(id, pattern)); +} + +async function resolveExperimentSuiteSelection( + suites: ExperimentConfig['suites'] | undefined, + cwd: string, +): Promise { + if (!suites || suites.length === 0) { return undefined; } - return evals.length === 1 ? evals[0] : [...evals]; + + const testFiles = new Set(); + const selectedTestIdsByEvalFile = new Map(); + + for (const suite of suites) { + const resolvedSuiteFiles = await resolveEvalPaths([suite.ref], cwd); + for (const testFilePath of resolvedSuiteFiles) { + const resolvedPath = path.resolve(testFilePath); + testFiles.add(resolvedPath); + if (suite.select?.testIds && suite.select.testIds.length > 0) { + const existing = selectedTestIdsByEvalFile.get(resolvedPath) ?? []; + selectedTestIdsByEvalFile.set(resolvedPath, [...existing, ...suite.select.testIds]); + } + } + } + + const filtersByEvalFile = new Map(); + for (const [testFilePath, testIds] of selectedTestIdsByEvalFile.entries()) { + const uniqueTestIds = [...new Set(testIds)]; + filtersByEvalFile.set( + testFilePath, + uniqueTestIds.length === 1 ? uniqueTestIds[0] : uniqueTestIds, + ); + } + + return { + testFiles: [...testFiles], + filtersByEvalFile, + }; } async function runExperimentSteps(params: { @@ -992,6 +1031,7 @@ async function prepareFileMetadata(params: { readonly repoRoot: string; readonly cwd: string; readonly options: NormalizedOptions; + readonly suiteFilter?: string | readonly string[]; }): Promise<{ readonly testIds: readonly string[]; readonly testCases: readonly EvalTest[]; @@ -1009,7 +1049,7 @@ async function prepareFileMetadata(params: { target: import('@agentv/core').ResolvedTarget, ) => import('@agentv/core').Provider; }> { - const { testFilePath, repoRoot, cwd, options } = params; + const { testFilePath, repoRoot, cwd, options, suiteFilter } = params; await ensureFileExists(testFilePath, 'Test file'); await loadEnvFromHierarchy({ @@ -1023,16 +1063,20 @@ async function prepareFileMetadata(params: { const suite = await loadTestSuite(testFilePath, repoRoot, { verbose: options.verbose, - filter: options.filter, + filter: suiteFilter ?? options.filter, category, }); - const testIds = suite.tests.map((value) => value.id); + const testCases = + suiteFilter && options.filter + ? suite.tests.filter((testCase) => matchesTestFilter(testCase.id, options.filter ?? '')) + : suite.tests; + const testIds = testCases.map((value) => value.id); const suiteTargets = suite.targets; - if (suite.tests.length === 0) { + if (testCases.length === 0) { return { testIds, - testCases: suite.tests, + testCases, selections: [], trialsConfig: options.experimentTrialsConfig, suiteTargets, @@ -1195,7 +1239,7 @@ async function prepareFileMetadata(params: { return { testIds, - testCases: suite.tests, + testCases, selections, trialsConfig: options.experimentTrialsConfig, suiteTargets, @@ -1529,11 +1573,15 @@ export async function runEvalCommand( experiment: resolvedExperiment.name, }; + const suiteSelection = await resolveExperimentSuiteSelection( + options.experimentConfig?.suites, + cwd, + ); const evalPathInputs = input.testFiles.length > 0 ? [...input.testFiles] - : options.experimentConfig?.evals !== undefined - ? [...(yamlConfig?.eval_patterns ?? DEFAULT_EVAL_PATTERNS)] + : suiteSelection + ? [...suiteSelection.testFiles] : []; if (evalPathInputs.length === 0 && process.stdin.isTTY) { const { launchInteractiveWizard } = await import('./interactive.js'); @@ -1541,6 +1589,12 @@ export async function runEvalCommand( return undefined; } const resolvedTestFiles = await resolveEvalPaths(evalPathInputs, cwd); + options = { + ...options, + ...(suiteSelection !== undefined && { + suiteFiltersByEvalFile: suiteSelection.filtersByEvalFile, + }), + }; if (!process.env.AGENTV_EXPERIMENT) { process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment); @@ -1813,6 +1867,7 @@ export async function runEvalCommand( repoRoot, cwd, options, + suiteFilter: options.suiteFiltersByEvalFile?.get(path.resolve(testFilePath)), }); fileMetadata.set(testFilePath, meta); } diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 083819931..f325ca537 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -518,7 +518,7 @@ describe('agentv eval CLI', () => { } }, 30_000); - it('runs a native experiment file with eval selection and run knobs', async () => { + it('runs a native experiment file with suite test selection and run knobs', async () => { const fixture = await createFixture(); try { const experimentsDir = path.join(fixture.suiteDir, 'experiments'); @@ -548,7 +548,11 @@ describe('agentv eval CLI', () => { [ 'name: native-exp', 'target: cli-target', - 'evals: case-alpha', + 'suites:', + ' - ref: sample.test.yaml', + ' select:', + ' test_ids:', + ' - case-alpha', 'timeout_seconds: 12', 'workers: 4', 'repeat:', @@ -580,6 +584,7 @@ describe('agentv eval CLI', () => { target: 'cli-target', agentTimeoutMs: 12000, maxConcurrency: 4, + evalCaseIds: ['case-alpha'], trials: { count: 2, strategy: 'mean', @@ -599,7 +604,14 @@ describe('agentv eval CLI', () => { name: 'native-exp', source_path: experimentPath, target: 'cli-target', - evals: 'case-alpha', + suites: [ + { + ref: 'sample.test.yaml', + select: { + test_ids: ['case-alpha'], + }, + }, + ], repeat: { count: 2, strategy: 'mean', diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx index 5e6f6a63b..de6a5b80b 100644 --- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx +++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx @@ -16,7 +16,12 @@ Committed experiments conventionally live under `experiments/`: ```yaml name: baseline target: codex-gpt5 -evals: "agent-*" +suites: + - ref: evals/support-regression.eval.yaml + select: + test_ids: + - refund-eligibility + - missing-order-date timeout_seconds: 720 repeat: count: 4 @@ -31,6 +36,30 @@ scripts: Wire fields use `snake_case`. AgentV translates to internal `camelCase` when it loads the file. +## Suites and test selection + +Eval files keep `tests[]` as the canonical atomic test definition. Experiments +reference one or more reusable eval suites through `suites[]`: + +```yaml +suites: + - ref: evals/support-regression.eval.yaml + - ref: evals/billing-*.eval.yaml +``` + +Use suite-local `select.test_ids[]` to run only specific tests from a suite. The +values match `tests[].id` inside that suite and use the same glob semantics as +`--test-id`: + +```yaml +suites: + - ref: evals/support-regression.eval.yaml + select: + test_ids: + - refund-* + - missing-order-date +``` + ## Repeat runs `repeat` is the full AgentV replacement for the old eval-level @@ -128,7 +157,7 @@ scripts: Run a specific experiment: ```bash -bun agentv eval evals/suite.eval.yaml --experiment experiments/default.yaml +bun agentv eval --experiment experiments/default.yaml ``` If no experiment is passed, AgentV checks `.agentv/config.yaml` for a default: diff --git a/examples/features/trials/README.md b/examples/features/trials/README.md index e1a3e161a..12241151f 100644 --- a/examples/features/trials/README.md +++ b/examples/features/trials/README.md @@ -14,8 +14,7 @@ behavior in committed experiment files. ## Run ```bash -bun agentv eval examples/features/trials/evals/dataset.eval.yaml \ - --experiment examples/features/trials/experiments/default.yaml +bun agentv eval --experiment examples/features/trials/experiments/default.yaml ``` Swap the experiment path to try the other strategies. diff --git a/examples/features/trials/experiments/confidence-interval.yaml b/examples/features/trials/experiments/confidence-interval.yaml index 27a9b915d..50199244d 100644 --- a/examples/features/trials/experiments/confidence-interval.yaml +++ b/examples/features/trials/experiments/confidence-interval.yaml @@ -1,8 +1,11 @@ name: trials-confidence-interval target: llm -evals: - - math-basics - - capital-knowledge +suites: + - ref: examples/features/trials/evals/dataset.eval.yaml + select: + test_ids: + - math-basics + - capital-knowledge repeat: count: 5 strategy: confidence_interval diff --git a/examples/features/trials/experiments/default.yaml b/examples/features/trials/experiments/default.yaml index af9dcb477..41f173be1 100644 --- a/examples/features/trials/experiments/default.yaml +++ b/examples/features/trials/experiments/default.yaml @@ -1,8 +1,11 @@ name: trials target: llm -evals: - - math-basics - - capital-knowledge +suites: + - ref: examples/features/trials/evals/dataset.eval.yaml + select: + test_ids: + - math-basics + - capital-knowledge repeat: count: 2 strategy: pass_at_k diff --git a/examples/features/trials/experiments/mean.yaml b/examples/features/trials/experiments/mean.yaml index 1c534a8b9..fab967b8f 100644 --- a/examples/features/trials/experiments/mean.yaml +++ b/examples/features/trials/experiments/mean.yaml @@ -1,8 +1,11 @@ name: trials-mean target: llm -evals: - - math-basics - - capital-knowledge +suites: + - ref: examples/features/trials/evals/dataset.eval.yaml + select: + test_ids: + - math-basics + - capital-knowledge repeat: count: 3 strategy: mean diff --git a/examples/showcase/multi-model-benchmark/README.md b/examples/showcase/multi-model-benchmark/README.md index 3ad5085ea..b22675a9c 100644 --- a/examples/showcase/multi-model-benchmark/README.md +++ b/examples/showcase/multi-model-benchmark/README.md @@ -37,8 +37,7 @@ From the repository root: ```bash # Run the full matrix (all targets × all tests × 2 repeat attempts) -bun agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml \ - --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml +bun agentv eval --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml ``` ### Cost & Safety @@ -132,11 +131,14 @@ targets: - copilot - claude - gemini-llm -evals: - - factual-* - - analytical-comparison - - creative-explanation - - structured-list +suites: + - ref: examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml + select: + test_ids: + - factual-* + - analytical-comparison + - creative-explanation + - structured-list repeat: count: 2 strategy: pass_at_k diff --git a/examples/showcase/multi-model-benchmark/experiments/default.yaml b/examples/showcase/multi-model-benchmark/experiments/default.yaml index 007167966..cb941192e 100644 --- a/examples/showcase/multi-model-benchmark/experiments/default.yaml +++ b/examples/showcase/multi-model-benchmark/experiments/default.yaml @@ -3,12 +3,15 @@ targets: - copilot - claude - gemini-llm -evals: - - factual-geography - - factual-science - - analytical-comparison - - creative-explanation - - structured-list +suites: + - ref: examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml + select: + test_ids: + - factual-geography + - factual-science + - analytical-comparison + - creative-explanation + - structured-list repeat: count: 2 strategy: pass_at_k diff --git a/packages/core/src/evaluation/experiment.ts b/packages/core/src/evaluation/experiment.ts index 2d1a512a8..a572511b3 100644 --- a/packages/core/src/evaluation/experiment.ts +++ b/packages/core/src/evaluation/experiment.ts @@ -58,6 +58,25 @@ export type ExperimentRepeat = { readonly costLimitUsd?: number; }; +export type ExperimentSuiteSelectWire = { + readonly test_ids?: readonly string[]; + readonly testIds?: readonly string[]; +}; + +export type ExperimentSuiteSelect = { + readonly testIds: readonly string[]; +}; + +export type ExperimentSuiteRefWire = { + readonly ref: string; + readonly select?: ExperimentSuiteSelectWire; +}; + +export type ExperimentSuiteRef = { + readonly ref: string; + readonly select?: ExperimentSuiteSelect; +}; + export type ExperimentConfigWire = { readonly name?: string; readonly agent?: string; @@ -65,7 +84,7 @@ export type ExperimentConfigWire = { readonly targets?: readonly ExperimentTargetRefWire[]; readonly model?: string; readonly agent_options?: Record; - readonly evals?: string | readonly string[]; + readonly suites?: readonly ExperimentSuiteRefWire[]; readonly scripts?: readonly ExperimentScriptWire[]; readonly repeat?: ExperimentRepeatWire; readonly runs?: number; @@ -85,7 +104,7 @@ export type ExperimentConfig = { readonly targets?: readonly ExperimentTargetRef[]; readonly model?: string; readonly agentOptions?: Record; - readonly evals?: string | readonly string[]; + readonly suites?: readonly ExperimentSuiteRef[]; readonly scripts?: readonly ExperimentScript[]; readonly repeat?: ExperimentRepeat; readonly runs?: number; @@ -108,7 +127,12 @@ export type ExperimentArtifactMetadata = { readonly target?: string; readonly targets?: readonly string[]; readonly model?: string; - readonly evals?: string | readonly string[]; + readonly suites?: readonly { + readonly ref: string; + readonly select?: { + readonly test_ids: readonly string[]; + }; + }[]; readonly repeat?: { readonly count: number; readonly strategy: TrialStrategy; @@ -187,7 +211,7 @@ export function normalizeExperimentConfig( const targets = readTargets(rawConfig.targets); const model = readOptionalString(rawConfig.model, 'model'); const agentOptions = readOptionalRecord(rawConfig.agent_options ?? rawConfig.agentOptions); - const evals = readOptionalStringOrStringArray(rawConfig.evals, 'evals'); + const suites = readSuites(rawConfig.suites); const scripts = readScriptArray(rawConfig.scripts, 'scripts'); const repeat = readRepeat(rawConfig.repeat); const runs = readOptionalPositiveInteger(rawConfig.runs, 'runs'); @@ -215,7 +239,7 @@ export function normalizeExperimentConfig( ...(targets !== undefined && { targets }), ...(model !== undefined && { model }), ...(agentOptions !== undefined && { agentOptions }), - ...(evals !== undefined && { evals }), + ...(suites !== undefined && { suites }), ...(scripts !== undefined && { scripts }), ...(repeat !== undefined && { repeat }), ...(runs !== undefined && { runs }), @@ -257,7 +281,7 @@ export function buildExperimentArtifactMetadata( ...(config.target !== undefined && { target: config.target }), ...(targets && targets.length > 0 && { targets }), ...(config.model !== undefined && { model: config.model }), - ...(config.evals !== undefined && { evals: config.evals }), + ...(config.suites !== undefined && { suites: config.suites.map(toSuiteArtifactMetadata) }), ...(config.repeat !== undefined && { repeat: { count: config.repeat.count, @@ -276,6 +300,20 @@ export function buildExperimentArtifactMetadata( }; } +function toSuiteArtifactMetadata(suite: ExperimentSuiteRef): { + readonly ref: string; + readonly select?: { readonly test_ids: readonly string[] }; +} { + return { + ref: suite.ref, + ...(suite.select !== undefined && { + select: { + test_ids: suite.select.testIds, + }, + }), + }; +} + function readRepeat(raw: unknown): ExperimentRepeat | undefined { if (raw === undefined) { return undefined; @@ -325,6 +363,43 @@ function readTargets(raw: unknown): readonly ExperimentTargetRef[] | undefined { }); } +function readSuites(raw: unknown): readonly ExperimentSuiteRef[] | undefined { + if (raw === undefined) { + return undefined; + } + if (!Array.isArray(raw)) { + throw new Error('Experiment suites must be an array.'); + } + if (raw.length === 0) { + throw new Error('Experiment suites must not be empty.'); + } + return raw.map((entry, index): ExperimentSuiteRef => { + if (!isRecord(entry)) { + throw new Error(`Experiment suites[${index}] must be an object.`); + } + const ref = readRequiredString(entry.ref, `suites[${index}].ref`); + const select = readSuiteSelect(entry.select, `suites[${index}].select`); + return { + ref, + ...(select !== undefined && { select }), + }; + }); +} + +function readSuiteSelect(raw: unknown, location: string): ExperimentSuiteSelect | undefined { + if (raw === undefined) { + return undefined; + } + if (!isRecord(raw)) { + throw new Error(`Experiment ${location} must be an object.`); + } + const testIds = readOptionalStringArray(raw.test_ids ?? raw.testIds, `${location}.test_ids`); + if (testIds === undefined) { + throw new Error(`Experiment ${location}.test_ids is required when select is set.`); + } + return { testIds }; +} + function readScriptArray(raw: unknown, location: string): readonly ExperimentScript[] | undefined { if (raw === undefined) { return undefined; @@ -424,6 +499,20 @@ function readOptionalStringOrStringArray( throw new Error(`Experiment ${location} must be a string or string array.`); } +function readOptionalStringArray(raw: unknown, location: string): readonly string[] | undefined { + if (raw === undefined) { + return undefined; + } + if ( + Array.isArray(raw) && + raw.length > 0 && + raw.every((entry) => typeof entry === 'string' && entry.trim()) + ) { + return raw.map((entry) => entry.trim()); + } + throw new Error(`Experiment ${location} must be a non-empty string array.`); +} + function readOptionalString(raw: unknown, location: string): string | undefined { if (raw === undefined) { return undefined; diff --git a/packages/core/src/evaluation/validation/experiment-file.schema.ts b/packages/core/src/evaluation/validation/experiment-file.schema.ts index 8a608753d..40f24c0d0 100644 --- a/packages/core/src/evaluation/validation/experiment-file.schema.ts +++ b/packages/core/src/evaluation/validation/experiment-file.schema.ts @@ -48,6 +48,19 @@ const ExperimentTargetRefSchema = z.union([ .strict(), ]); +const ExperimentSuiteSelectSchema = z + .object({ + test_ids: z.array(z.string().min(1)).min(1), + }) + .strict(); + +const ExperimentSuiteRefSchema = z + .object({ + ref: z.string().min(1), + select: ExperimentSuiteSelectSchema.optional(), + }) + .strict(); + export const ExperimentFileSchema = z .object({ name: z.string().min(1).optional(), @@ -56,7 +69,7 @@ export const ExperimentFileSchema = z targets: z.array(ExperimentTargetRefSchema).min(1).optional(), model: z.string().min(1).optional(), agent_options: JsonObjectSchema.optional(), - evals: StringOrStringArraySchema.optional(), + suites: z.array(ExperimentSuiteRefSchema).min(1).optional(), scripts: z.array(ExperimentScriptSchema).optional(), repeat: ExperimentRepeatSchema.optional(), runs: z.number().int().min(1).optional(), diff --git a/packages/core/test/evaluation/experiment.test.ts b/packages/core/test/evaluation/experiment.test.ts index be90638e7..e9cecddf9 100644 --- a/packages/core/test/evaluation/experiment.test.ts +++ b/packages/core/test/evaluation/experiment.test.ts @@ -19,7 +19,12 @@ describe('experiment config', () => { agent: 'codex', model: 'openai/gpt-5.5', agent_options: { reasoning_effort: 'high' }, - evals: 'evals/**/*.eval.yaml', + suites: [ + { + ref: 'evals/support.eval.yaml', + select: { test_ids: ['refund-eligibility', 'missing-order-date'] }, + }, + ], scripts: ['build', { script: 'bun test', timeout_seconds: 120 }], runs: 3, early_exit: false, @@ -36,7 +41,12 @@ describe('experiment config', () => { agent: 'codex', model: 'openai/gpt-5.5', agentOptions: { reasoning_effort: 'high' }, - evals: 'evals/**/*.eval.yaml', + suites: [ + { + ref: 'evals/support.eval.yaml', + select: { testIds: ['refund-eligibility', 'missing-order-date'] }, + }, + ], scripts: [{ script: 'build' }, { script: 'bun test', timeoutSeconds: 120 }], runs: 3, earlyExit: false, @@ -65,6 +75,34 @@ describe('experiment config', () => { }); }); + it('normalizes suite references with suite-local test id selectors', () => { + const config = normalizeExperimentConfig({ + suites: [ + { + ref: 'evals/support-regression.eval.yaml', + select: { + test_ids: ['refund-eligibility', 'missing-order-date'], + }, + }, + { + ref: 'evals/billing-*.eval.yaml', + }, + ], + }); + + expect(config.suites).toEqual([ + { + ref: 'evals/support-regression.eval.yaml', + select: { + testIds: ['refund-eligibility', 'missing-order-date'], + }, + }, + { + ref: 'evals/billing-*.eval.yaml', + }, + ]); + }); + it('accepts the prerelease trials costLimitUsd spelling only inside repeat', () => { const config = normalizeExperimentConfig({ repeat: { @@ -123,6 +161,12 @@ describe('experiment config', () => { /repeat and runs/, ); expect(() => normalizeExperimentConfig({ sandbox: 'host' })).toThrow(/sandbox/); + expect(() => normalizeExperimentConfig({ suites: [] })).toThrow(/suites/); + expect(() => + normalizeExperimentConfig({ + suites: [{ ref: 'evals/support.eval.yaml', select: { test_ids: [] } }], + }), + ).toThrow(/suites\[0\]\.select\.test_ids/); }); it('builds safe snake_case artifact metadata', () => { @@ -132,6 +176,12 @@ describe('experiment config', () => { agent_options: { secret: 'not persisted' }, setup: [{ script: 'bun install' }], scripts: [{ script: 'bun test' }], + suites: [ + { + ref: 'evals/support.eval.yaml', + select: { test_ids: ['refund-*'] }, + }, + ], repeat: { count: 2, strategy: 'mean', cost_limit_usd: 0.5 }, early_exit: true, timeout_seconds: 120, @@ -143,6 +193,12 @@ describe('experiment config', () => { expect(metadata).toMatchObject({ name: 'baseline', target: 'codex', + suites: [ + { + ref: 'evals/support.eval.yaml', + select: { test_ids: ['refund-*'] }, + }, + ], repeat: { count: 2, strategy: 'mean', diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md index 950791bff..86427366d 100644 --- a/skills-data/agentv-eval-writer/SKILL.md +++ b/skills-data/agentv-eval-writer/SKILL.md @@ -20,8 +20,10 @@ Treat YAML as the canonical portable model. Prefer authoring `.eval.yaml` / `EVA Eval files define what is tested: prompts, datasets, assertions, and task fixtures. Experiment files define how those evals run: targets, setup, scripts, timeout, -sandbox, and repeat-run policy. Use `experiments/*.yaml` for committed run -configurations. +sandbox, suite selection, and repeat-run policy. Use `experiments/*.yaml` for +committed run configurations. In eval YAML, keep `tests[]` as the atomic eval +definition. In experiment YAML, reference eval suites with `suites[]` and select +suite-local tests with `select.test_ids[]`. Use `@agentv/sdk` for TypeScript helper imports. Do not use `@agentv/eval` for new evals, examples, scaffolds, or skill guidance; it was a deprecated compatibility package and has been removed from this repository. diff --git a/skills-data/agentv-eval-writer/references/experiment-schema.json b/skills-data/agentv-eval-writer/references/experiment-schema.json index 481dafc33..86414774c 100644 --- a/skills-data/agentv-eval-writer/references/experiment-schema.json +++ b/skills-data/agentv-eval-writer/references/experiment-schema.json @@ -60,21 +60,35 @@ "properties": {}, "additionalProperties": {} }, - "evals": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "array", - "items": { + "suites": { + "type": "array", + "items": { + "type": "object", + "properties": { + "ref": { "type": "string", "minLength": 1 }, - "minItems": 1 - } - ] + "select": { + "type": "object", + "properties": { + "test_ids": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + } + }, + "required": ["test_ids"], + "additionalProperties": false + } + }, + "required": ["ref"], + "additionalProperties": false + }, + "minItems": 1 }, "scripts": { "type": "array",