diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 5e2eb1058..6eeaff397 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -1,11 +1,11 @@
 import { spawn } from 'node:child_process';
 import { constants, existsSync, mkdirSync } from 'node:fs';
 import { access, readFile } from 'node:fs/promises';
+import { createRequire } from 'node:module';
 import path from 'node:path';
 import { pathToFileURL } from 'node:url';
 
 import {
-  DEFAULT_EVAL_PATTERNS,
   DEFAULT_THRESHOLD,
   type EvalTargetRef,
   type EvalTest,
@@ -89,6 +89,10 @@ import type { TaskBundleTargetSelection } from './task-bundle.js';
 import { WipCheckpointLoop } from './wip-checkpoint.js';
 
 const DEFAULT_WORKERS = 3;
+const require = createRequire(import.meta.url); // CommonJS-style `require` for this ES module
+const micromatch = require('micromatch') as {
+  isMatch(id: string, pattern: string): boolean; // the only micromatch API typed here
+};
 
 function shouldSkipExistingResultForResume(
   result: Pick<EvaluationResult, 'executionStatus'>,
@@ -156,6 +160,7 @@ interface NormalizedOptions {
   readonly experimentMetadata?: ExperimentArtifactMetadata;
   readonly experimentTargetRefs?: readonly EvalTargetRef[];
   readonly experimentTrialsConfig?: TrialsConfig;
+  readonly suiteFiltersByEvalFile?: ReadonlyMap<string, string | readonly string[]>;
   readonly budgetUsd?: number;
   readonly sourceMetadataByEvalFile?: ReadonlyMap<string, Record<string, unknown>>;
   readonly resultsOverrides?: ResultsPublishOverrides;
@@ -650,8 +655,6 @@ function applyExperimentOptions(
   const workspaceMode =
     options.workspaceMode ?? readExperimentWorkspaceMode(experiment.workspace?.mode);
   const workspacePath = options.workspacePath ?? readExperimentWorkspacePath(experiment.workspace);
-  const experimentFilter = normalizeExperimentCaseFilter(experiment.evals);
-
   return {
     ...options,
     target: options.target ?? (nextCliTargets.length === 1 ? nextCliTargets[0] : undefined),
@@ -660,7 +663,6 @@ function applyExperimentOptions(
     workers: options.workers ?? experiment.workers,
     workspaceMode: workspacePath ? 'static' : workspaceMode,
     workspacePath,
-    filter: options.filter ?? experimentFilter,
     budgetUsd: options.budgetUsd ?? experiment.budgetUsd,
     experimentConfig: experiment,
     experimentMetadata: buildExperimentArtifactMetadata(experiment),
@@ -724,16 +726,53 @@ function readExperimentWorkspacePath(
   return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined;
 }
 
-function normalizeExperimentCaseFilter(
-  evals: ExperimentConfig['evals'] | undefined,
-): NormalizedOptions['filter'] {
-  if (typeof evals === 'string') {
-    return evals;
-  }
-  if (!Array.isArray(evals)) {
+type ExperimentSuiteSelection = {
+  readonly testFiles: readonly string[]; // de-duplicated `path.resolve`d eval files
+  readonly filtersByEvalFile: ReadonlyMap<string, string | readonly string[]>; // by eval file
+};
+
+/** True when `id` matches the micromatch pattern `filter`, or any pattern when it is a list. */
+function matchesTestFilter(id: string, filter: string | readonly string[]): boolean {
+  const globs = typeof filter === 'string' ? [filter] : filter;
+  return globs.some((glob) => micromatch.isMatch(id, glob));
+}
+
+/**
+ * Resolves experiment `suites[]` via `resolveEvalPaths` against `cwd` into absolute eval files
+ * plus per-file test-id filters (union of selectors); `undefined` when no suites are given.
+ */
+async function resolveExperimentSuiteSelection(
+  suites: ExperimentConfig['suites'] | undefined,
+  cwd: string,
+): Promise<ExperimentSuiteSelection | undefined> {
+  if (!suites || suites.length === 0) {
     return undefined;
   }
-  return evals.length === 1 ? evals[0] : [...evals];
+
+  const testFiles = new Set<string>();
+  const unfilteredFiles = new Set<string>();
+  const selectedTestIdsByEvalFile = new Map<string, string[]>();
+  for (const suite of suites) {
+    const testIds = suite.select?.testIds ?? [];
+    for (const testFilePath of await resolveEvalPaths([suite.ref], cwd)) {
+      const resolvedPath = path.resolve(testFilePath);
+      testFiles.add(resolvedPath);
+      if (testIds.length === 0) {
+        // A suite without a selector asks for every test, so it overrides narrower selectors.
+        unfilteredFiles.add(resolvedPath);
+      }
+      const existing = selectedTestIdsByEvalFile.get(resolvedPath) ?? [];
+      selectedTestIdsByEvalFile.set(resolvedPath, [...existing, ...testIds]);
+    }
+  }
+  const filtersByEvalFile = new Map<string, string | readonly string[]>();
+  for (const [testFilePath, testIds] of selectedTestIdsByEvalFile) {
+    const ids = [...new Set(testIds)];
+    if (ids.length > 0 && !unfilteredFiles.has(testFilePath)) {
+      filtersByEvalFile.set(testFilePath, ids.length === 1 ? ids[0] : ids);
+    }
+  }
+  return { testFiles: [...testFiles], filtersByEvalFile };
 }
 
 async function runExperimentSteps(params: {
@@ -992,6 +1031,7 @@ async function prepareFileMetadata(params: {
   readonly repoRoot: string;
   readonly cwd: string;
   readonly options: NormalizedOptions;
+  readonly suiteFilter?: string | readonly string[];
 }): Promise<{
   readonly testIds: readonly string[];
   readonly testCases: readonly EvalTest[];
@@ -1009,7 +1049,7 @@ async function prepareFileMetadata(params: {
     target: import('@agentv/core').ResolvedTarget,
   ) => import('@agentv/core').Provider;
 }> {
-  const { testFilePath, repoRoot, cwd, options } = params;
+  const { testFilePath, repoRoot, cwd, options, suiteFilter } = params;
 
   await ensureFileExists(testFilePath, 'Test file');
   await loadEnvFromHierarchy({
@@ -1023,16 +1063,20 @@ async function prepareFileMetadata(params: {
 
   const suite = await loadTestSuite(testFilePath, repoRoot, {
     verbose: options.verbose,
-    filter: options.filter,
+    filter: suiteFilter ?? options.filter,
     category,
   });
-  const testIds = suite.tests.map((value) => value.id);
+  const testCases =
+    suiteFilter && options.filter
+      ? suite.tests.filter((testCase) => matchesTestFilter(testCase.id, options.filter ?? ''))
+      : suite.tests;
+  const testIds = testCases.map((value) => value.id);
   const suiteTargets = suite.targets;
 
-  if (suite.tests.length === 0) {
+  if (testCases.length === 0) {
     return {
       testIds,
-      testCases: suite.tests,
+      testCases,
       selections: [],
       trialsConfig: options.experimentTrialsConfig,
       suiteTargets,
@@ -1195,7 +1239,7 @@ async function prepareFileMetadata(params: {
 
   return {
     testIds,
-    testCases: suite.tests,
+    testCases,
     selections,
     trialsConfig: options.experimentTrialsConfig,
     suiteTargets,
@@ -1529,11 +1573,15 @@ export async function runEvalCommand(
     experiment: resolvedExperiment.name,
   };
 
+  const suiteSelection = await resolveExperimentSuiteSelection(
+    options.experimentConfig?.suites,
+    cwd,
+  );
   const evalPathInputs =
     input.testFiles.length > 0
       ? [...input.testFiles]
-      : options.experimentConfig?.evals !== undefined
-        ? [...(yamlConfig?.eval_patterns ?? DEFAULT_EVAL_PATTERNS)]
+      : suiteSelection
+        ? [...suiteSelection.testFiles]
         : [];
   if (evalPathInputs.length === 0 && process.stdin.isTTY) {
     const { launchInteractiveWizard } = await import('./interactive.js');
@@ -1541,6 +1589,12 @@ export async function runEvalCommand(
     return undefined;
   }
   const resolvedTestFiles = await resolveEvalPaths(evalPathInputs, cwd);
+  options = {
+    ...options,
+    ...(suiteSelection !== undefined && {
+      suiteFiltersByEvalFile: suiteSelection.filtersByEvalFile,
+    }),
+  };
 
   if (!process.env.AGENTV_EXPERIMENT) {
     process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
@@ -1813,6 +1867,7 @@ export async function runEvalCommand(
       repoRoot,
       cwd,
       options,
+      suiteFilter: options.suiteFiltersByEvalFile?.get(path.resolve(testFilePath)),
     });
     fileMetadata.set(testFilePath, meta);
   }
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 083819931..f325ca537 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -518,7 +518,7 @@ describe('agentv eval CLI', () => {
     }
   }, 30_000);
 
-  it('runs a native experiment file with eval selection and run knobs', async () => {
+  it('runs a native experiment file with suite test selection and run knobs', async () => {
     const fixture = await createFixture();
     try {
       const experimentsDir = path.join(fixture.suiteDir, 'experiments');
@@ -548,7 +548,11 @@ describe('agentv eval CLI', () => {
         [
           'name: native-exp',
           'target: cli-target',
-          'evals: case-alpha',
+          'suites:',
+          '  - ref: sample.test.yaml',
+          '    select:',
+          '      test_ids:',
+          '        - case-alpha',
           'timeout_seconds: 12',
           'workers: 4',
           'repeat:',
@@ -580,6 +584,7 @@ describe('agentv eval CLI', () => {
         target: 'cli-target',
         agentTimeoutMs: 12000,
         maxConcurrency: 4,
+        evalCaseIds: ['case-alpha'],
         trials: {
           count: 2,
           strategy: 'mean',
@@ -599,7 +604,14 @@ describe('agentv eval CLI', () => {
         name: 'native-exp',
         source_path: experimentPath,
         target: 'cli-target',
-        evals: 'case-alpha',
+        suites: [
+          {
+            ref: 'sample.test.yaml',
+            select: {
+              test_ids: ['case-alpha'],
+            },
+          },
+        ],
         repeat: {
           count: 2,
           strategy: 'mean',
diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
index 5e6f6a63b..de6a5b80b 100644
--- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
@@ -16,7 +16,12 @@ Committed experiments conventionally live under `experiments/`:
 ```yaml
 name: baseline
 target: codex-gpt5
-evals: "agent-*"
+suites:
+  - ref: evals/support-regression.eval.yaml
+    select:
+      test_ids:
+        - refund-eligibility
+        - missing-order-date
 timeout_seconds: 720
 repeat:
   count: 4
@@ -31,6 +36,30 @@ scripts:
 Wire fields use `snake_case`. AgentV translates to internal `camelCase` when it
 loads the file.
 
+## Suites and test selection
+
+Eval files keep `tests[]` as the canonical atomic test definition. Experiments
+reference one or more reusable eval suites through `suites[]`:
+
+```yaml
+suites:
+  - ref: evals/support-regression.eval.yaml
+  - ref: evals/billing-*.eval.yaml
+```
+
+Use suite-local `select.test_ids[]` to run only specific tests from a suite. Each
+value is a glob pattern matched against `tests[].id` inside that suite, using the
+same semantics as `--test-id`:
+
+```yaml
+suites:
+  - ref: evals/support-regression.eval.yaml
+    select:
+      test_ids:
+        - refund-*
+        - missing-order-date
+```
+
 ## Repeat runs
 
 `repeat` is the full AgentV replacement for the old eval-level
@@ -128,7 +157,7 @@ scripts:
 Run a specific experiment:
 
 ```bash
-bun agentv eval evals/suite.eval.yaml --experiment experiments/default.yaml
+bun agentv eval --experiment experiments/default.yaml
 ```
 
 If no experiment is passed, AgentV checks `.agentv/config.yaml` for a default:
diff --git a/examples/features/trials/README.md b/examples/features/trials/README.md
index e1a3e161a..12241151f 100644
--- a/examples/features/trials/README.md
+++ b/examples/features/trials/README.md
@@ -14,8 +14,7 @@ behavior in committed experiment files.
 ## Run
 
 ```bash
-bun agentv eval examples/features/trials/evals/dataset.eval.yaml \
-  --experiment examples/features/trials/experiments/default.yaml
+bun agentv eval --experiment examples/features/trials/experiments/default.yaml
 ```
 
 Swap the experiment path to try the other strategies.
diff --git a/examples/features/trials/experiments/confidence-interval.yaml b/examples/features/trials/experiments/confidence-interval.yaml
index 27a9b915d..50199244d 100644
--- a/examples/features/trials/experiments/confidence-interval.yaml
+++ b/examples/features/trials/experiments/confidence-interval.yaml
@@ -1,8 +1,11 @@
 name: trials-confidence-interval
 target: llm
-evals:
-  - math-basics
-  - capital-knowledge
+suites:
+  - ref: examples/features/trials/evals/dataset.eval.yaml
+    select:
+      test_ids:
+        - math-basics
+        - capital-knowledge
 repeat:
   count: 5
   strategy: confidence_interval
diff --git a/examples/features/trials/experiments/default.yaml b/examples/features/trials/experiments/default.yaml
index af9dcb477..41f173be1 100644
--- a/examples/features/trials/experiments/default.yaml
+++ b/examples/features/trials/experiments/default.yaml
@@ -1,8 +1,11 @@
 name: trials
 target: llm
-evals:
-  - math-basics
-  - capital-knowledge
+suites:
+  - ref: examples/features/trials/evals/dataset.eval.yaml
+    select:
+      test_ids:
+        - math-basics
+        - capital-knowledge
 repeat:
   count: 2
   strategy: pass_at_k
diff --git a/examples/features/trials/experiments/mean.yaml b/examples/features/trials/experiments/mean.yaml
index 1c534a8b9..fab967b8f 100644
--- a/examples/features/trials/experiments/mean.yaml
+++ b/examples/features/trials/experiments/mean.yaml
@@ -1,8 +1,11 @@
 name: trials-mean
 target: llm
-evals:
-  - math-basics
-  - capital-knowledge
+suites:
+  - ref: examples/features/trials/evals/dataset.eval.yaml
+    select:
+      test_ids:
+        - math-basics
+        - capital-knowledge
 repeat:
   count: 3
   strategy: mean
diff --git a/examples/showcase/multi-model-benchmark/README.md b/examples/showcase/multi-model-benchmark/README.md
index 3ad5085ea..b22675a9c 100644
--- a/examples/showcase/multi-model-benchmark/README.md
+++ b/examples/showcase/multi-model-benchmark/README.md
@@ -37,8 +37,7 @@ From the repository root:
 
 ```bash
 # Run the full matrix (all targets × all tests × 2 repeat attempts)
-bun agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml \
-  --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml
+bun agentv eval --experiment examples/showcase/multi-model-benchmark/experiments/default.yaml
 ```
 
 ### Cost & Safety
@@ -132,11 +131,14 @@ targets:
   - copilot
   - claude
   - gemini-llm
-evals:
-  - factual-*
-  - analytical-comparison
-  - creative-explanation
-  - structured-list
+suites:
+  - ref: examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
+    select:
+      test_ids:
+        - factual-*
+        - analytical-comparison
+        - creative-explanation
+        - structured-list
 repeat:
   count: 2
   strategy: pass_at_k
diff --git a/examples/showcase/multi-model-benchmark/experiments/default.yaml b/examples/showcase/multi-model-benchmark/experiments/default.yaml
index 007167966..cb941192e 100644
--- a/examples/showcase/multi-model-benchmark/experiments/default.yaml
+++ b/examples/showcase/multi-model-benchmark/experiments/default.yaml
@@ -3,12 +3,15 @@ targets:
   - copilot
   - claude
   - gemini-llm
-evals:
-  - factual-geography
-  - factual-science
-  - analytical-comparison
-  - creative-explanation
-  - structured-list
+suites:
+  - ref: examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml
+    select:
+      test_ids:
+        - factual-geography
+        - factual-science
+        - analytical-comparison
+        - creative-explanation
+        - structured-list
 repeat:
   count: 2
   strategy: pass_at_k
diff --git a/packages/core/src/evaluation/experiment.ts b/packages/core/src/evaluation/experiment.ts
index 2d1a512a8..a572511b3 100644
--- a/packages/core/src/evaluation/experiment.ts
+++ b/packages/core/src/evaluation/experiment.ts
@@ -58,6 +58,25 @@ export type ExperimentRepeat = {
   readonly costLimitUsd?: number;
 };
 
+export type ExperimentSuiteSelectWire = {
+  readonly test_ids?: readonly string[]; // snake_case wire spelling
+  readonly testIds?: readonly string[]; // camelCase spelling, also read by readSuiteSelect
+};
+
+export type ExperimentSuiteSelect = {
+  readonly testIds: readonly string[]; // trimmed, non-empty list after normalization
+};
+
+export type ExperimentSuiteRefWire = {
+  readonly ref: string;
+  readonly select?: ExperimentSuiteSelectWire;
+};
+
+export type ExperimentSuiteRef = {
+  readonly ref: string;
+  readonly select?: ExperimentSuiteSelect; // omitted when the config entry has no `select`
+};
+
 export type ExperimentConfigWire = {
   readonly name?: string;
   readonly agent?: string;
@@ -65,7 +84,7 @@ export type ExperimentConfigWire = {
   readonly targets?: readonly ExperimentTargetRefWire[];
   readonly model?: string;
   readonly agent_options?: Record<string, unknown>;
-  readonly evals?: string | readonly string[];
+  readonly suites?: readonly ExperimentSuiteRefWire[];
   readonly scripts?: readonly ExperimentScriptWire[];
   readonly repeat?: ExperimentRepeatWire;
   readonly runs?: number;
@@ -85,7 +104,7 @@ export type ExperimentConfig = {
   readonly targets?: readonly ExperimentTargetRef[];
   readonly model?: string;
   readonly agentOptions?: Record<string, unknown>;
-  readonly evals?: string | readonly string[];
+  readonly suites?: readonly ExperimentSuiteRef[];
   readonly scripts?: readonly ExperimentScript[];
   readonly repeat?: ExperimentRepeat;
   readonly runs?: number;
@@ -108,7 +127,12 @@ export type ExperimentArtifactMetadata = {
   readonly target?: string;
   readonly targets?: readonly string[];
   readonly model?: string;
-  readonly evals?: string | readonly string[];
+  readonly suites?: readonly {
+    readonly ref: string;
+    readonly select?: {
+      readonly test_ids: readonly string[];
+    };
+  }[];
   readonly repeat?: {
     readonly count: number;
     readonly strategy: TrialStrategy;
@@ -187,7 +211,7 @@ export function normalizeExperimentConfig(
   const targets = readTargets(rawConfig.targets);
   const model = readOptionalString(rawConfig.model, 'model');
   const agentOptions = readOptionalRecord(rawConfig.agent_options ?? rawConfig.agentOptions);
-  const evals = readOptionalStringOrStringArray(rawConfig.evals, 'evals');
+  const suites = readSuites(rawConfig.suites);
   const scripts = readScriptArray(rawConfig.scripts, 'scripts');
   const repeat = readRepeat(rawConfig.repeat);
   const runs = readOptionalPositiveInteger(rawConfig.runs, 'runs');
@@ -215,7 +239,7 @@ export function normalizeExperimentConfig(
     ...(targets !== undefined && { targets }),
     ...(model !== undefined && { model }),
     ...(agentOptions !== undefined && { agentOptions }),
-    ...(evals !== undefined && { evals }),
+    ...(suites !== undefined && { suites }),
     ...(scripts !== undefined && { scripts }),
     ...(repeat !== undefined && { repeat }),
     ...(runs !== undefined && { runs }),
@@ -257,7 +281,7 @@ export function buildExperimentArtifactMetadata(
     ...(config.target !== undefined && { target: config.target }),
     ...(targets && targets.length > 0 && { targets }),
     ...(config.model !== undefined && { model: config.model }),
-    ...(config.evals !== undefined && { evals: config.evals }),
+    ...(config.suites !== undefined && { suites: config.suites.map(toSuiteArtifactMetadata) }),
     ...(config.repeat !== undefined && {
       repeat: {
         count: config.repeat.count,
@@ -276,6 +300,20 @@ export function buildExperimentArtifactMetadata(
   };
 }
 
+/**
+ * Maps a normalized suite ref to its snake_case artifact shape; `select` is kept only when set.
+ */
+function toSuiteArtifactMetadata(suite: ExperimentSuiteRef): {
+  readonly ref: string;
+  readonly select?: { readonly test_ids: readonly string[] };
+} {
+  const { ref, select } = suite;
+  if (select === undefined) {
+    return { ref };
+  }
+  return { ref, select: { test_ids: select.testIds } };
+}
+
 function readRepeat(raw: unknown): ExperimentRepeat | undefined {
   if (raw === undefined) {
     return undefined;
@@ -325,6 +363,43 @@ function readTargets(raw: unknown): readonly ExperimentTargetRef[] | undefined {
   });
 }
 
+/** Normalizes raw `suites`: `undefined` passes through, else a non-empty array of objects. */
+function readSuites(raw: unknown): readonly ExperimentSuiteRef[] | undefined {
+  if (raw === undefined) {
+    return undefined;
+  }
+  if (!Array.isArray(raw)) {
+    throw new Error('Experiment suites must be an array.');
+  }
+  if (raw.length === 0) {
+    throw new Error('Experiment suites must not be empty.');
+  }
+  return raw.map((item, position): ExperimentSuiteRef => {
+    if (!isRecord(item)) {
+      throw new Error(`Experiment suites[${position}] must be an object.`);
+    }
+    // Location prefix passed to the nested field readers.
+    const where = `suites[${position}]`;
+    const ref = readRequiredString(item.ref, `${where}.ref`);
+    const select = readSuiteSelect(item.select, `${where}.select`);
+    return select === undefined ? { ref } : { ref, select };
+  });
+}
+
+/** Normalizes `suites[].select`: `undefined` passes through, otherwise `test_ids` is required. */
+function readSuiteSelect(raw: unknown, location: string): ExperimentSuiteSelect | undefined {
+  if (raw === undefined) {
+    return undefined;
+  }
+  if (!isRecord(raw)) {
+    throw new Error(`Experiment ${location} must be an object.`);
+  }
+  const idsLocation = `${location}.test_ids`;
+  const testIds = readOptionalStringArray(raw.test_ids ?? raw.testIds, idsLocation);
+  if (testIds !== undefined) return { testIds };
+  throw new Error(`Experiment ${idsLocation} is required when select is set.`);
+}
+
 function readScriptArray(raw: unknown, location: string): readonly ExperimentScript[] | undefined {
   if (raw === undefined) {
     return undefined;
@@ -424,6 +499,20 @@ function readOptionalStringOrStringArray(
   throw new Error(`Experiment ${location} must be a string or string array.`);
 }
 
+/** Reads an optional list of non-blank strings, trimming entries; `undefined` passes through. */
+function readOptionalStringArray(raw: unknown, location: string): readonly string[] | undefined {
+  if (raw === undefined) {
+    return undefined;
+  }
+  const isNonBlank = (entry: unknown): entry is string =>
+    typeof entry === 'string' && entry.trim().length > 0;
+  // Anything but a non-empty array made solely of non-blank strings is rejected.
+  if (!Array.isArray(raw) || raw.length === 0 || !raw.every(isNonBlank)) {
+    throw new Error(`Experiment ${location} must be a non-empty string array.`);
+  }
+  return raw.map((entry: string) => entry.trim());
+}
+
 function readOptionalString(raw: unknown, location: string): string | undefined {
   if (raw === undefined) {
     return undefined;
diff --git a/packages/core/src/evaluation/validation/experiment-file.schema.ts b/packages/core/src/evaluation/validation/experiment-file.schema.ts
index 8a608753d..40f24c0d0 100644
--- a/packages/core/src/evaluation/validation/experiment-file.schema.ts
+++ b/packages/core/src/evaluation/validation/experiment-file.schema.ts
@@ -48,6 +48,19 @@ const ExperimentTargetRefSchema = z.union([
     .strict(),
 ]);
 
+const ExperimentSuiteSelectSchema = z
+  .object({
+    test_ids: z.array(z.string().min(1)).min(1), // at least one non-empty test id pattern
+  })
+  .strict();
+
+const ExperimentSuiteRefSchema = z
+  .object({
+    ref: z.string().min(1), // eval file path or glob for the suite
+    select: ExperimentSuiteSelectSchema.optional(), // omit to leave the suite unfiltered
+  })
+  .strict();
+
 export const ExperimentFileSchema = z
   .object({
     name: z.string().min(1).optional(),
@@ -56,7 +69,7 @@ export const ExperimentFileSchema = z
     targets: z.array(ExperimentTargetRefSchema).min(1).optional(),
     model: z.string().min(1).optional(),
     agent_options: JsonObjectSchema.optional(),
-    evals: StringOrStringArraySchema.optional(),
+    suites: z.array(ExperimentSuiteRefSchema).min(1).optional(),
     scripts: z.array(ExperimentScriptSchema).optional(),
     repeat: ExperimentRepeatSchema.optional(),
     runs: z.number().int().min(1).optional(),
diff --git a/packages/core/test/evaluation/experiment.test.ts b/packages/core/test/evaluation/experiment.test.ts
index be90638e7..e9cecddf9 100644
--- a/packages/core/test/evaluation/experiment.test.ts
+++ b/packages/core/test/evaluation/experiment.test.ts
@@ -19,7 +19,12 @@ describe('experiment config', () => {
       agent: 'codex',
       model: 'openai/gpt-5.5',
       agent_options: { reasoning_effort: 'high' },
-      evals: 'evals/**/*.eval.yaml',
+      suites: [
+        {
+          ref: 'evals/support.eval.yaml',
+          select: { test_ids: ['refund-eligibility', 'missing-order-date'] },
+        },
+      ],
       scripts: ['build', { script: 'bun test', timeout_seconds: 120 }],
       runs: 3,
       early_exit: false,
@@ -36,7 +41,12 @@ describe('experiment config', () => {
       agent: 'codex',
       model: 'openai/gpt-5.5',
       agentOptions: { reasoning_effort: 'high' },
-      evals: 'evals/**/*.eval.yaml',
+      suites: [
+        {
+          ref: 'evals/support.eval.yaml',
+          select: { testIds: ['refund-eligibility', 'missing-order-date'] },
+        },
+      ],
       scripts: [{ script: 'build' }, { script: 'bun test', timeoutSeconds: 120 }],
       runs: 3,
       earlyExit: false,
@@ -65,6 +75,34 @@ describe('experiment config', () => {
     });
   });
 
+  it('normalizes suite references with suite-local test id selectors', () => {
+    const config = normalizeExperimentConfig({
+      suites: [
+        {
+          ref: 'evals/support-regression.eval.yaml',
+          select: {
+            test_ids: ['refund-eligibility', 'missing-order-date'],
+          },
+        },
+        {
+          ref: 'evals/billing-*.eval.yaml',
+        },
+      ],
+    });
+
+    expect(config.suites).toEqual([
+      {
+        ref: 'evals/support-regression.eval.yaml',
+        select: {
+          testIds: ['refund-eligibility', 'missing-order-date'],
+        },
+      },
+      {
+        ref: 'evals/billing-*.eval.yaml',
+      },
+    ]);
+  });
+
   it('accepts the prerelease trials costLimitUsd spelling only inside repeat', () => {
     const config = normalizeExperimentConfig({
       repeat: {
@@ -123,6 +161,12 @@ describe('experiment config', () => {
       /repeat and runs/,
     );
     expect(() => normalizeExperimentConfig({ sandbox: 'host' })).toThrow(/sandbox/);
+    expect(() => normalizeExperimentConfig({ suites: [] })).toThrow(/suites/);
+    expect(() =>
+      normalizeExperimentConfig({
+        suites: [{ ref: 'evals/support.eval.yaml', select: { test_ids: [] } }],
+      }),
+    ).toThrow(/suites\[0\]\.select\.test_ids/);
   });
 
   it('builds safe snake_case artifact metadata', () => {
@@ -132,6 +176,12 @@ describe('experiment config', () => {
       agent_options: { secret: 'not persisted' },
       setup: [{ script: 'bun install' }],
       scripts: [{ script: 'bun test' }],
+      suites: [
+        {
+          ref: 'evals/support.eval.yaml',
+          select: { test_ids: ['refund-*'] },
+        },
+      ],
       repeat: { count: 2, strategy: 'mean', cost_limit_usd: 0.5 },
       early_exit: true,
       timeout_seconds: 120,
@@ -143,6 +193,12 @@ describe('experiment config', () => {
     expect(metadata).toMatchObject({
       name: 'baseline',
       target: 'codex',
+      suites: [
+        {
+          ref: 'evals/support.eval.yaml',
+          select: { test_ids: ['refund-*'] },
+        },
+      ],
       repeat: {
         count: 2,
         strategy: 'mean',
diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md
index 950791bff..86427366d 100644
--- a/skills-data/agentv-eval-writer/SKILL.md
+++ b/skills-data/agentv-eval-writer/SKILL.md
@@ -20,8 +20,10 @@ Treat YAML as the canonical portable model. Prefer authoring `.eval.yaml` / `EVA
 
 Eval files define what is tested: prompts, datasets, assertions, and task fixtures.
 Experiment files define how those evals run: targets, setup, scripts, timeout,
-sandbox, and repeat-run policy. Use `experiments/*.yaml` for committed run
-configurations.
+sandbox, suite selection, and repeat-run policy. Use `experiments/*.yaml` for
+committed run configurations. In eval YAML, keep `tests[]` as the atomic test
+definition. In experiment YAML, reference eval suites with `suites[]` and narrow
+each suite to specific tests with its own `select.test_ids[]`.
 
 Use `@agentv/sdk` for TypeScript helper imports. Do not use `@agentv/eval` for new evals, examples, scaffolds, or skill guidance; it was a deprecated compatibility package and has been removed from this repository.
 
diff --git a/skills-data/agentv-eval-writer/references/experiment-schema.json b/skills-data/agentv-eval-writer/references/experiment-schema.json
index 481dafc33..86414774c 100644
--- a/skills-data/agentv-eval-writer/references/experiment-schema.json
+++ b/skills-data/agentv-eval-writer/references/experiment-schema.json
@@ -60,21 +60,35 @@
           "properties": {},
           "additionalProperties": {}
         },
-        "evals": {
-          "anyOf": [
-            {
-              "type": "string",
-              "minLength": 1
-            },
-            {
-              "type": "array",
-              "items": {
+        "suites": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "ref": {
                 "type": "string",
                 "minLength": 1
               },
-              "minItems": 1
-            }
-          ]
+              "select": {
+                "type": "object",
+                "properties": {
+                  "test_ids": {
+                    "type": "array",
+                    "items": {
+                      "type": "string",
+                      "minLength": 1
+                    },
+                    "minItems": 1
+                  }
+                },
+                "required": ["test_ids"],
+                "additionalProperties": false
+              }
+            },
+            "required": ["ref"],
+            "additionalProperties": false
+          },
+          "minItems": 1
         },
         "scripts": {
           "type": "array",