Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 75 additions & 20 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import { spawn } from 'node:child_process';
import { constants, existsSync, mkdirSync } from 'node:fs';
import { access, readFile } from 'node:fs/promises';
import { createRequire } from 'node:module';
import path from 'node:path';
import { pathToFileURL } from 'node:url';

import {
DEFAULT_EVAL_PATTERNS,
DEFAULT_THRESHOLD,
type EvalTargetRef,
type EvalTest,
Expand Down Expand Up @@ -89,6 +89,10 @@ import type { TaskBundleTargetSelection } from './task-bundle.js';
import { WipCheckpointLoop } from './wip-checkpoint.js';

// Default worker count; where it is applied is outside this excerpt — TODO confirm.
const DEFAULT_WORKERS = 3;
// This file is an ES module (it reads `import.meta.url`), so there is no
// built-in `require`; build one anchored at this module's URL.
const require = createRequire(import.meta.url);
// Load micromatch through that `require`, typing only the `isMatch` method
// that this file calls.
const micromatch = require('micromatch') as {
isMatch(id: string, pattern: string): boolean;
};

function shouldSkipExistingResultForResume(
result: Pick<EvaluationResult, 'executionStatus'>,
Expand Down Expand Up @@ -156,6 +160,7 @@ interface NormalizedOptions {
readonly experimentMetadata?: ExperimentArtifactMetadata;
readonly experimentTargetRefs?: readonly EvalTargetRef[];
readonly experimentTrialsConfig?: TrialsConfig;
readonly suiteFiltersByEvalFile?: ReadonlyMap<string, string | readonly string[]>;
readonly budgetUsd?: number;
readonly sourceMetadataByEvalFile?: ReadonlyMap<string, Record<string, unknown>>;
readonly resultsOverrides?: ResultsPublishOverrides;
Expand Down Expand Up @@ -650,8 +655,6 @@ function applyExperimentOptions(
const workspaceMode =
options.workspaceMode ?? readExperimentWorkspaceMode(experiment.workspace?.mode);
const workspacePath = options.workspacePath ?? readExperimentWorkspacePath(experiment.workspace);
const experimentFilter = normalizeExperimentCaseFilter(experiment.evals);

return {
...options,
target: options.target ?? (nextCliTargets.length === 1 ? nextCliTargets[0] : undefined),
Expand All @@ -660,7 +663,6 @@ function applyExperimentOptions(
workers: options.workers ?? experiment.workers,
workspaceMode: workspacePath ? 'static' : workspaceMode,
workspacePath,
filter: options.filter ?? experimentFilter,
budgetUsd: options.budgetUsd ?? experiment.budgetUsd,
experimentConfig: experiment,
experimentMetadata: buildExperimentArtifactMetadata(experiment),
Expand Down Expand Up @@ -724,16 +726,53 @@ function readExperimentWorkspacePath(
return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined;
}

function normalizeExperimentCaseFilter(
evals: ExperimentConfig['evals'] | undefined,
): NormalizedOptions['filter'] {
if (typeof evals === 'string') {
return evals;
}
if (!Array.isArray(evals)) {
/**
 * Result of resolving an experiment's `suites[]` entries.
 */
type ExperimentSuiteSelection = {
// Eval file paths gathered from every suite `ref`.
readonly testFiles: readonly string[];
// Per-eval-file test filter: either a single pattern or a list of patterns.
readonly filtersByEvalFile: ReadonlyMap<string, string | readonly string[]>;
};

/**
 * Reports whether a test id is accepted by a filter.
 *
 * @param id - The test id to check.
 * @param filter - One pattern, or a list of patterns of which any may match.
 * @returns `true` as soon as `micromatch.isMatch` accepts the id for one of
 *   the patterns; `false` when none do (including for an empty list).
 */
function matchesTestFilter(id: string, filter: string | readonly string[]): boolean {
  // Treat the single-pattern form as a one-element list so both shapes share a path.
  const patterns: readonly string[] = typeof filter === 'string' ? [filter] : filter;
  for (const candidate of patterns) {
    if (micromatch.isMatch(id, candidate)) {
      return true;
    }
  }
  return false;
}

/**
 * Resolves an experiment's `suites[]` entries into concrete eval files plus
 * per-file test-id filters.
 *
 * Each suite `ref` is expanded with `resolveEvalPaths` relative to `cwd`, and
 * every resulting path is normalized with `path.resolve` so the same file
 * reached through different suites collapses into one entry.
 *
 * @param suites - The experiment's suite references; may be undefined or empty.
 * @param cwd - Directory passed to `resolveEvalPaths` when expanding each `ref`.
 * @returns `undefined` when no suites are configured. Otherwise the
 *   de-duplicated file list (first-seen order) and a map from resolved file
 *   path to its filter: a single string when exactly one unique test id was
 *   selected for that file, else the array of unique ids. Files that no suite
 *   selected ids for have no entry in the map.
 */
async function resolveExperimentSuiteSelection(
  suites: ExperimentConfig['suites'] | undefined,
  cwd: string,
): Promise<ExperimentSuiteSelection | undefined> {
  if (!suites || suites.length === 0) {
    return undefined;
  }

  const testFiles = new Set<string>();
  // A Set per file de-duplicates ids as they arrive while keeping first-seen order.
  const selectedTestIdsByEvalFile = new Map<string, Set<string>>();

  // Suites are resolved one at a time so file and id ordering follows the config.
  for (const suite of suites) {
    const selectedTestIds = suite.select?.testIds ?? [];
    const resolvedSuiteFiles = await resolveEvalPaths([suite.ref], cwd);
    for (const testFilePath of resolvedSuiteFiles) {
      const resolvedPath = path.resolve(testFilePath);
      testFiles.add(resolvedPath);
      if (selectedTestIds.length === 0) {
        continue;
      }
      // NOTE(review): a file referenced both with and without `select` still
      // ends up filtered by the selecting suite's ids — confirm this is intended.
      let idsForFile = selectedTestIdsByEvalFile.get(resolvedPath);
      if (idsForFile === undefined) {
        idsForFile = new Set<string>();
        selectedTestIdsByEvalFile.set(resolvedPath, idsForFile);
      }
      for (const testId of selectedTestIds) {
        idsForFile.add(testId);
      }
    }
  }

  const filtersByEvalFile = new Map<string, string | readonly string[]>();
  for (const [testFilePath, testIds] of selectedTestIdsByEvalFile) {
    const uniqueTestIds = [...testIds];
    // A lone id is stored as a plain string rather than a one-element array.
    filtersByEvalFile.set(
      testFilePath,
      uniqueTestIds.length === 1 ? uniqueTestIds[0] : uniqueTestIds,
    );
  }

  return {
    testFiles: [...testFiles],
    filtersByEvalFile,
  };
}

async function runExperimentSteps(params: {
Expand Down Expand Up @@ -992,6 +1031,7 @@ async function prepareFileMetadata(params: {
readonly repoRoot: string;
readonly cwd: string;
readonly options: NormalizedOptions;
readonly suiteFilter?: string | readonly string[];
}): Promise<{
readonly testIds: readonly string[];
readonly testCases: readonly EvalTest[];
Expand All @@ -1009,7 +1049,7 @@ async function prepareFileMetadata(params: {
target: import('@agentv/core').ResolvedTarget,
) => import('@agentv/core').Provider;
}> {
const { testFilePath, repoRoot, cwd, options } = params;
const { testFilePath, repoRoot, cwd, options, suiteFilter } = params;

await ensureFileExists(testFilePath, 'Test file');
await loadEnvFromHierarchy({
Expand All @@ -1023,16 +1063,20 @@ async function prepareFileMetadata(params: {

const suite = await loadTestSuite(testFilePath, repoRoot, {
verbose: options.verbose,
filter: options.filter,
filter: suiteFilter ?? options.filter,
category,
});
const testIds = suite.tests.map((value) => value.id);
const testCases =
suiteFilter && options.filter
? suite.tests.filter((testCase) => matchesTestFilter(testCase.id, options.filter ?? ''))
: suite.tests;
const testIds = testCases.map((value) => value.id);
const suiteTargets = suite.targets;

if (suite.tests.length === 0) {
if (testCases.length === 0) {
return {
testIds,
testCases: suite.tests,
testCases,
selections: [],
trialsConfig: options.experimentTrialsConfig,
suiteTargets,
Expand Down Expand Up @@ -1195,7 +1239,7 @@ async function prepareFileMetadata(params: {

return {
testIds,
testCases: suite.tests,
testCases,
selections,
trialsConfig: options.experimentTrialsConfig,
suiteTargets,
Expand Down Expand Up @@ -1529,18 +1573,28 @@ export async function runEvalCommand(
experiment: resolvedExperiment.name,
};

const suiteSelection = await resolveExperimentSuiteSelection(
options.experimentConfig?.suites,
cwd,
);
const evalPathInputs =
input.testFiles.length > 0
? [...input.testFiles]
: options.experimentConfig?.evals !== undefined
? [...(yamlConfig?.eval_patterns ?? DEFAULT_EVAL_PATTERNS)]
: suiteSelection
? [...suiteSelection.testFiles]
: [];
if (evalPathInputs.length === 0 && process.stdin.isTTY) {
const { launchInteractiveWizard } = await import('./interactive.js');
await launchInteractiveWizard();
return undefined;
}
const resolvedTestFiles = await resolveEvalPaths(evalPathInputs, cwd);
options = {
...options,
...(suiteSelection !== undefined && {
suiteFiltersByEvalFile: suiteSelection.filtersByEvalFile,
}),
};

if (!process.env.AGENTV_EXPERIMENT) {
process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
Expand Down Expand Up @@ -1813,6 +1867,7 @@ export async function runEvalCommand(
repoRoot,
cwd,
options,
suiteFilter: options.suiteFiltersByEvalFile?.get(path.resolve(testFilePath)),
});
fileMetadata.set(testFilePath, meta);
}
Expand Down
18 changes: 15 additions & 3 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ describe('agentv eval CLI', () => {
}
}, 30_000);

it('runs a native experiment file with eval selection and run knobs', async () => {
it('runs a native experiment file with suite test selection and run knobs', async () => {
const fixture = await createFixture();
try {
const experimentsDir = path.join(fixture.suiteDir, 'experiments');
Expand Down Expand Up @@ -548,7 +548,11 @@ describe('agentv eval CLI', () => {
[
'name: native-exp',
'target: cli-target',
'evals: case-alpha',
'suites:',
' - ref: sample.test.yaml',
' select:',
' test_ids:',
' - case-alpha',
'timeout_seconds: 12',
'workers: 4',
'repeat:',
Expand Down Expand Up @@ -580,6 +584,7 @@ describe('agentv eval CLI', () => {
target: 'cli-target',
agentTimeoutMs: 12000,
maxConcurrency: 4,
evalCaseIds: ['case-alpha'],
trials: {
count: 2,
strategy: 'mean',
Expand All @@ -599,7 +604,14 @@ describe('agentv eval CLI', () => {
name: 'native-exp',
source_path: experimentPath,
target: 'cli-target',
evals: 'case-alpha',
suites: [
{
ref: 'sample.test.yaml',
select: {
test_ids: ['case-alpha'],
},
},
],
repeat: {
count: 2,
strategy: 'mean',
Expand Down
33 changes: 31 additions & 2 deletions apps/web/src/content/docs/docs/evaluation/experiments.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@ Committed experiments conventionally live under `experiments/`:
```yaml
name: baseline
target: codex-gpt5
evals: "agent-*"
suites:
- ref: evals/support-regression.eval.yaml
select:
test_ids:
- refund-eligibility
- missing-order-date
timeout_seconds: 720
repeat:
count: 4
Expand All @@ -31,6 +36,30 @@ scripts:
Wire fields use `snake_case`. AgentV translates to internal `camelCase` when it
loads the file.

## Suites and test selection

Eval files keep `tests[]` as the canonical atomic test definition. Experiments
reference one or more reusable eval suites through `suites[]`:

```yaml
suites:
- ref: evals/support-regression.eval.yaml
- ref: evals/billing-*.eval.yaml
```

Use suite-local `select.test_ids[]` to run only specific tests from a suite. The
values match `tests[].id` inside that suite and use the same glob semantics as
`--test-id`:

```yaml
suites:
- ref: evals/support-regression.eval.yaml
select:
test_ids:
- refund-*
- missing-order-date
```

## Repeat runs

`repeat` is the full AgentV replacement for the old eval-level
Expand Down Expand Up @@ -128,7 +157,7 @@ scripts:
Run a specific experiment:

```bash
bun agentv eval evals/suite.eval.yaml --experiment experiments/default.yaml
bun agentv eval --experiment experiments/default.yaml
```

If no experiment is passed, AgentV checks `.agentv/config.yaml` for a default:
Expand Down
3 changes: 1 addition & 2 deletions examples/features/trials/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ behavior in committed experiment files.
## Run

```bash
bun agentv eval examples/features/trials/evals/dataset.eval.yaml \
--experiment examples/features/trials/experiments/default.yaml
bun agentv eval --experiment examples/features/trials/experiments/default.yaml
```

Swap the experiment path to try the other strategies.
Expand Down
9 changes: 6 additions & 3 deletions examples/features/trials/experiments/confidence-interval.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
name: trials-confidence-interval
target: llm
evals:
- math-basics
- capital-knowledge
suites:
- ref: examples/features/trials/evals/dataset.eval.yaml
select:
test_ids:
- math-basics
- capital-knowledge
repeat:
count: 5
strategy: confidence_interval
Expand Down
9 changes: 6 additions & 3 deletions examples/features/trials/experiments/default.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
name: trials
target: llm
evals:
- math-basics
- capital-knowledge
suites:
- ref: examples/features/trials/evals/dataset.eval.yaml
select:
test_ids:
- math-basics
- capital-knowledge
repeat:
count: 2
strategy: pass_at_k
Expand Down
9 changes: 6 additions & 3 deletions examples/features/trials/experiments/mean.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
name: trials-mean
target: llm
evals:
- math-basics
- capital-knowledge
suites:
- ref: examples/features/trials/evals/dataset.eval.yaml
select:
test_ids:
- math-basics
- capital-knowledge
repeat:
count: 3
strategy: mean
Expand Down
16 changes: 9 additions & 7 deletions examples/showcase/multi-model-benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ From the repository root:

```bash
# Run the full matrix (all targets × all tests × 2 repeat attempts)
bun agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml \
--experiment examples/showcase/multi-model-benchmark/experiments/default.yaml
bun agentv eval --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml
```

### Cost & Safety
Expand Down Expand Up @@ -132,11 +131,14 @@ targets:
- copilot
- claude
- gemini-llm
evals:
- factual-*
- analytical-comparison
- creative-explanation
- structured-list
suites:
- ref: examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
select:
test_ids:
- factual-*
- analytical-comparison
- creative-explanation
- structured-list
repeat:
count: 2
strategy: pass_at_k
Expand Down
Loading
Loading