diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index 7f2c0b6..6d36a6b 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.99.0"
+version = "0.100.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
index 45ac544..dc207aa 100644
--- a/clients/python/src/agent_eval_rpc/__init__.py
+++ b/clients/python/src/agent_eval_rpc/__init__.py
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.99.0"
+    __version__ = "0.100.0"
 
 __all__ = [
     "Client",
diff --git a/package.json b/package.json
index d463106..7cd0413 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.99.0",
+  "version": "0.100.0",
   "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
diff --git a/src/hidden-criteria-grading.test.ts b/src/hidden-criteria-grading.test.ts
new file mode 100644
index 0000000..8554076
--- /dev/null
+++ b/src/hidden-criteria-grading.test.ts
@@ -0,0 +1,214 @@
+import { describe, expect, it } from 'vitest'
+import type { JudgeScore } from './campaign/types'
+import { ValidationError } from './errors'
+import {
+  agentVisibleFields,
+  assertNoHiddenLeak,
+  blendHeldout,
+  defaultBlendWeights,
+  type FieldDestination,
+  gradeOnHidden,
+  type HiddenCriteriaGrader,
+  hiddenGrade,
+  isHiddenDestination,
+  routeFields,
+  withHeldoutBlend,
+} from './hidden-criteria-grading'
+
+// A NON-coding domain proves the firewall has no domain coupling: here it is a
+// legal-brief task. The agent sees the question + a sample citation; the hidden
+// "answer key" (the required holdings) and the rubric anchors are graded but
+// never reach the agent. Any domain plugs in its own grader the same way.
+const legalRouting = {
+  question: 'agent-visible',
+  sampleCitation: 'develop-against',
+  requiredHoldings: 'grading-only', // hidden: the answer key
+  rubricNote: 'judge-only', // hidden: rubric anchor meant for the judge
+} as const satisfies Record<string, FieldDestination> // checks the destinations, keeps the literal keys
+
+const legalValues = {
+  question: 'Draft a brief arguing the search violated the Fourth Amendment.',
+  sampleCitation: 'See Katz v. United States, 389 U.S. 347 (1967).',
+  requiredHoldings: 'Must cite Carpenter v. United States and the third-party-doctrine limit.',
+  rubricNote: 'Reward a clean reasonable-expectation-of-privacy framing.',
+} as const // the rendered text of each routed field, keyed exactly like legalRouting
+
+describe('field routing by destination', () => {
+  it('classifies grading-only and judge-only as hidden, the rest as visible', () => {
+    expect(isHiddenDestination('grading-only')).toBe(true)
+    expect(isHiddenDestination('judge-only')).toBe(true)
+    expect(isHiddenDestination('agent-visible')).toBe(false)
+    expect(isHiddenDestination('develop-against')).toBe(false)
+  })
+
+  it('routes a domain field map into RoutedFields', () => {
+    const fields = routeFields(legalRouting, legalValues)
+    expect(fields).toHaveLength(4) // one RoutedField per key of legalRouting
+    expect(fields.find((f) => f.name === 'requiredHoldings')?.destination).toBe('grading-only')
+  })
+
+  it('fails loud when a routed field has no value', () => { // "b" is routed but absent from values
+    expect(() =>
+      routeFields({ a: 'agent-visible', b: 'grading-only' }, { a: 'present' } as unknown as Record<
+        'a' | 'b',
+        string
+      >), // the double cast only silences the compiler so the runtime guard is what gets exercised
+    ).toThrow(ValidationError)
+  })
+
+  it('agentVisibleFields keeps only the non-hidden fields', () => {
+    const visible = agentVisibleFields(routeFields(legalRouting, legalValues))
+    const names = visible.map((f) => f.name).sort() // sorted so the assertion ignores field order
+    expect(names).toEqual(['question', 'sampleCitation'])
+  })
+})
+
+describe('assertNoHiddenLeak — the firewall', () => {
+  const fields = routeFields(legalRouting, legalValues) // shared by every case except the last, which shadows it
+
+  it('passes when the agent context holds only visible fields', () => {
+    const cleanContext = `${legalValues.question}\n${legalValues.sampleCitation}`
+    expect(() => assertNoHiddenLeak(fields, cleanContext)).not.toThrow()
+  })
+
+  it('REJECTS when a grading-only field reaches the agent context', () => {
+    const leakyContext = `${legalValues.question}\n${legalValues.requiredHoldings}`
+    let thrown: unknown
+    try {
+      assertNoHiddenLeak(fields, leakyContext)
+    } catch (err) { // captured by hand so both the error type and its message can be asserted
+      thrown = err
+    }
+    expect(thrown).toBeInstanceOf(ValidationError)
+    expect((thrown as Error).message).toMatch(/requiredHoldings/) // the leaked field is named
+    expect((thrown as Error).message).toMatch(/grading-only/) // and so is its destination
+  })
+
+  it('REJECTS when a judge-only field reaches the agent context', () => {
+    const leakyContext = `${legalValues.question}\n${legalValues.rubricNote}`
+    expect(() => assertNoHiddenLeak(fields, leakyContext)).toThrow(/judge-only/)
+  })
+
+  it('does not flag a develop-against field that appears in the context (intentional)', () => {
+    const tddContext = `${legalValues.question}\n${legalValues.sampleCitation}` // sampleCitation is develop-against
+    expect(() => assertNoHiddenLeak(fields, tddContext)).not.toThrow()
+  })
+
+  it('skips a too-short hidden value (no spurious substring match)', () => {
+    const fields = routeFields(
+      { task: 'agent-visible', key: 'grading-only' },
+      { task: 'Summarize the contract clause about indemnity.', key: 'A' }, // 1 char, below the default 12
+    )
+    expect(() => assertNoHiddenLeak(fields, 'A wholly innocent prompt.')).not.toThrow() // context does contain "A"
+  })
+})
+
+describe('hiddenGrade — honest pass-rate normalization', () => {
+  it('computes passRate = passed / total', () => {
+    expect(hiddenGrade(3, 4).passRate).toBeCloseTo(0.75)
+  })
+
+  it('returns 0 (honest no-run) when total is 0', () => {
+    const g = hiddenGrade(0, 0, 'criteria did not run') // 0 of 0: nothing was attempted
+    expect(g.passRate).toBe(0) // a defined 0, not the NaN that 0 / 0 would give
+    expect(g.total).toBe(0)
+    expect(g.notes).toBe('criteria did not run') // notes ride through unchanged
+  })
+
+  it('never reports passed above total', () => {
+    expect(hiddenGrade(9, 4).passed).toBe(4) // 9 claimed passes are capped at the 4 checks that ran
+  })
+})
+
+describe('gradeOnHidden — firewall + domain grader wired', () => {
+  // The domain's OWN grader: check the brief artifact against the hidden
+  // required-holdings string. The substrate bakes in NO node/test/exec — this
+  // grader is pure string matching; a coding domain would run node --test here.
+  const legalGrader: HiddenCriteriaGrader<{ brief: string }, { mustCite: string[] }> = (
+    artifact,
+    hidden,
+  ) => {
+    const passed = hidden.mustCite.filter((c) => artifact.brief.includes(c)).length // citations found in the brief
+    return hiddenGrade(passed, hidden.mustCite.length)
+  }
+
+  const fields = routeFields(legalRouting, legalValues)
+  const agentContext = `${legalValues.question}\n${legalValues.sampleCitation}` // visible fields only
+
+  it('grades against the hidden criteria behind the firewall', async () => {
+    const result = await gradeOnHidden({
+      artifact: { brief: 'We rely on Carpenter v. United States and Katz.' },
+      hiddenCriteria: { mustCite: ['Carpenter v. United States', 'Katz'] },
+      grader: legalGrader,
+      firewall: { fields, agentContext },
+    })
+    expect(result.passRate).toBe(1) // both mustCite strings occur in the brief
+    expect(result.total).toBe(2)
+  })
+
+  it('throws BEFORE grading if the firewall is breached at grading time', async () => {
+    const leaky = `${agentContext}\n${legalValues.requiredHoldings}` // a grading-only value appended to the context
+    await expect(
+      gradeOnHidden({
+        artifact: { brief: 'irrelevant' },
+        hiddenCriteria: { mustCite: ['Carpenter v. United States'] },
+        grader: legalGrader,
+        firewall: { fields, agentContext: leaky },
+      }),
+    ).rejects.toBeInstanceOf(ValidationError) // gradeOnHidden is async, so the throw surfaces as a rejection
+  })
+})
+
+describe('blendHeldout — composite weighting', () => {
+  it('composes with the default 0.7 / 0.3 weights', () => {
+    // 0.7 * 1.0 (perfect held-out) + 0.3 * 0.5 (mediocre judge) = 0.85
+    expect(blendHeldout(1, 0.5)).toBeCloseTo(0.85)
+    // 0.7 * 0.0 (failed held-out) + 0.3 * 1.0 (loved by judge) = 0.30 (capped low)
+    expect(blendHeldout(0, 1)).toBeCloseTo(0.3)
+    expect(defaultBlendWeights).toEqual({ heldout: 0.7, judge: 0.3 })
+  })
+
+  it('renormalizes arbitrary positive weight ratios', () => {
+    // 3:1 ratio == 0.75 / 0.25
+    expect(blendHeldout(1, 0, { heldout: 3, judge: 1 })).toBeCloseTo(0.75)
+    expect(blendHeldout(0, 1, { heldout: 1, judge: 1 })).toBeCloseTo(0.5) // 1:1 ratio == 0.5 / 0.5
+  })
+
+  it('clamps out-of-range inputs to [0,1]', () => {
+    expect(blendHeldout(2, -1)).toBeCloseTo(0.7) // 2 clamps to 1, -1 clamps to 0: 0.7*1 + 0.3*0
+  })
+
+  it('throws on a non-positive weight sum', () => {
+    expect(() => blendHeldout(1, 1, { heldout: 0, judge: 0 })).toThrow(ValidationError) // 0 + 0 cannot be renormalized
+  })
+})
+
+describe('withHeldoutBlend — judge composite becomes the blend', () => {
+  const baseScore = (_input: { artifact: { heldoutPassRate: number } }): JudgeScore => ({ // stub judge
+    dimensions: { quality: 0.5 },
+    composite: 0.5, // constant: the stub ignores its input
+    notes: 'style ok',
+  })
+
+  it('replaces the judge composite with the held-out-weighted blend', async () => {
+    const blended = withHeldoutBlend(baseScore, (a) => a.heldoutPassRate)
+    const out = await blended({ artifact: { heldoutPassRate: 1 } })
+    // 0.7 * 1.0 + 0.3 * 0.5 = 0.85
+    expect(out.composite).toBeCloseTo(0.85)
+    expect(out.dimensions).toEqual({ quality: 0.5 }) // the judge's dimensions are kept as scored
+    expect(out.notes).toMatch(/held-out 100%/) // the breakdown is written into the notes
+  })
+
+  it('passes a failed judge verdict through untouched', async () => {
+    const failing = (_i: { artifact: { heldoutPassRate: number } }): JudgeScore => ({
+      dimensions: {},
+      composite: 0,
+      notes: 'judge errored',
+      failed: true,
+    })
+    const blended = withHeldoutBlend(failing, (a) => a.heldoutPassRate)
+    const out = await blended({ artifact: { heldoutPassRate: 1 } })
+    expect(out.failed).toBe(true)
+    expect(out.composite).toBe(0) // stays 0 rather than the 0.7 a blend with pass rate 1 would give
+  })
+})
diff --git a/src/hidden-criteria-grading.ts b/src/hidden-criteria-grading.ts
new file mode 100644
index 0000000..6e4836f
--- /dev/null
+++ b/src/hidden-criteria-grading.ts
@@ -0,0 +1,329 @@
+/**
+ * Hidden-criteria grading firewall — grade an agent on criteria it never saw.
+ *
+ * A trustworthy benchmark splits every scenario's data by WHERE each field is
+ * allowed to flow, then proves the held-out / judge-only fields never reach the
+ * agent during the run. The coding bench expresses this with four destinations
+ * (prompt / develop-against / held-out suite / rubric); this module lifts the
+ * domain-FREE core out of it so research, legal, tax, content — any domain —
+ * can declare the same routing and get the same firewall enforcement and the
+ * same held-out-weighted composite, plugging in its OWN grader.
+ *
+ * Two reusable pieces, both domain-agnostic:
+ *
+ *   1. FIELD ROUTING BY DESTINATION. A scenario declares each field's
+ *      `FieldDestination`; `assertNoHiddenLeak` is a pure checker that throws if
+ *      a grading-only or judge-only field's value appears in what reaches the
+ *      agent. The domain decides which fields exist and where they go — the
+ *      substrate only enforces "hidden stays hidden".
+ *
+ *   2. HIDDEN-CRITERIA GRADING. The domain supplies its own grader
+ *      `(artifact, hiddenCriteria) => { passRate, total }` — the coding
+ *      node-test executor is ONE such grader a consumer plugs in; the substrate
+ *      bakes in NO node/test/TS/exec/regex. `gradeOnHidden` runs that grader
+ *      behind the firewall and `blendHeldout` composes its pass rate with a
+ *      judge score into the final number the leaderboard ranks on.
+ *
+ * Shape mirrors `treatment-gate`/`authenticity`: pure predicates and pure
+ * composition over already-computed values, fail-loud, with the
+ * "which field / which weight / which grader" decisions left as parameters and
+ * no domain literal anywhere in the module.
+ *
+ * Lives next to `test-graded-scenario` and `partition-held-out` — it is a
+ * scorecard/grading concept that makes sense without a running agent loop.
+ */
+
+import type { JudgeScore } from './campaign/types'
+import { ValidationError } from './errors'
+
+// ── 1. field routing by destination ──────────────────────────────────────────
+
+/**
+ * Where one scenario field is allowed to flow. The firewall guarantee is keyed
+ * on this tag, not on a field name — a domain can have any number of fields per
+ * destination.
+ *
+ *   - `agent-visible`   reaches the agent's context during the run (the prompt,
+ *                       the task statement — what the agent reads to act).
+ *   - `develop-against` seeded into the agent's environment during the run so it
+ *                       can iterate (a visible example/test/reference). The
+ *                       agent MAY read it — that is intentional (real TDD). Not
+ *                       a leak: it is example-grade, not the grading criteria.
+ *   - `grading-only`    the hidden criteria. Used ONLY at grading, after the run
+ *                       — the held-out suite / answer key / hidden requirements.
+ *                       Must NEVER reach the agent context. This is what makes a
+ *                       good score un-memorizable.
+ *   - `judge-only`      grading context for the judge only (rubric anchors,
+ *                       design intent). Lives with the judge, never in the agent
+ *                       context.
+ */
+export type FieldDestination = 'agent-visible' | 'develop-against' | 'grading-only' | 'judge-only'
+
+/** Destinations barred from the agent context — the one list every hidden-check reads. */
+const hiddenDestinations: ReadonlySet<FieldDestination> = new Set<FieldDestination>(
+  ['grading-only', 'judge-only'],
+)
+
+/** Reports whether values routed to `destination` must stay out of the agent context. */
+export function isHiddenDestination(destination: FieldDestination): boolean {
+  const mustStayHidden = hiddenDestinations.has(destination)
+  return mustStayHidden
+}
+
+/**
+ * One scenario field tagged with the destination it may flow to. The domain
+ * owns the field set and declares one `FieldDestination` per field name;
+ * `routeFields` builds these from the domain's `routing` and `values` records
+ * (keyed by the same field names); the firewall reads them.
+ */
+export interface RoutedField {
+  /** The field's name — reported in the firewall's breach message. */
+  name: string
+  /** The field's value as it would be rendered into text. The firewall trims it
+   *  and substring-matches it against the agent context, so a domain that ships
+   *  structured data passes a stable string projection (e.g. JSON) of the value. */
+  value: string
+  destination: FieldDestination // where this value is allowed to flow
+}
+
+/**
+ * Route a domain's named fields by destination into the firewall's input shape.
+ * `routing` declares each field's destination; `values` carries each field's
+ * renderable string. Every key of `routing` must resolve to a string in
+ * `values` — a missing, null or otherwise non-string value is an authoring
+ * error and throws `ValidationError` (fail loud) instead of reaching the firewall.
+ */
+export function routeFields<TName extends string>(
+  routing: Readonly<Record<TName, FieldDestination>>,
+  values: Readonly<Record<TName, string>>,
+): RoutedField[] {
+  return (Object.keys(routing) as TName[]).map((name) => {
+    // `typeof` (not `=== undefined`) also rejects null, non-strings and inherited
+    // members such as `toString`, which would otherwise crash the firewall's `.trim()`.
+    const value: unknown = values[name]
+    if (typeof value !== 'string') {
+      throw new ValidationError(
+        `routed field "${name}" has a destination but no value — every routed field must carry its value`,
+      )
+    }
+    return { name, value, destination: routing[name] }
+  })
+}
+
+/** A single detected leak: a hidden field whose value appears in the agent context. */
+export interface HiddenLeak {
+  field: string // name of the routed field whose value was found
+  destination: FieldDestination // the hidden destination that field was routed to
+}
+
+export interface NoLeakOptions {
+  /** Minimum length (after trimming) a hidden value needs to be checked. Shorter
+   *  values are skipped — a one-word or empty hidden field would substring-match
+   *  innocuous prose and is not meaningful evidence of a leak. Default 12. */
+  minMatchLength?: number
+}
+
+/**
+ * The FIREWALL. Throws `ValidationError` if any `grading-only`/`judge-only`
+ * field's trimmed value is found inside `agentContext` — the exact text that
+ * reaches the agent during the run (its prompt, its seeded files concatenated,
+ * whatever the caller assembled). `agent-visible` and `develop-against` fields
+ * are never checked: they are meant to be there.
+ *
+ * Substring containment is the check: it is domain-free and catches a hidden
+ * answer key, held-out case, or rubric anchor pasted into the prompt. Values that
+ * are empty or shorter than `minMatchLength` are skipped. Returns `fields` to chain.
+ */
+export function assertNoHiddenLeak(
+  fields: readonly RoutedField[],
+  agentContext: string,
+  opts: NoLeakOptions = {},
+): readonly RoutedField[] {
+  const minLen = opts.minMatchLength ?? 12
+  const leaks: HiddenLeak[] = []
+  for (const { name, value, destination } of fields) {
+    if (!isHiddenDestination(destination)) continue
+    const needle = value.trim()
+    // `includes('')` is true for every context, so an empty needle is never a leak —
+    // this guard matters when a caller lowers `minMatchLength` to 0 or below.
+    if (needle.length === 0 || needle.length < minLen) continue
+    if (agentContext.includes(needle)) leaks.push({ field: name, destination })
+  }
+  if (leaks.length > 0) {
+    const detail = leaks.map((l) => `"${l.field}" (${l.destination})`).join(', ')
+    throw new ValidationError(
+      `hidden-criteria firewall breached: ${leaks.length} hidden field(s) reached the agent context: ${detail}`,
+    )
+  }
+  return fields
+}
+
+/** Fields safe to render into the agent context: every non-hidden field, i.e.
+ *  `agent-visible` plus `develop-against` (seeded for the agent on purpose). Build
+ *  the context from this list rather than hand-picking fields and risking a slip. */
+export function agentVisibleFields(fields: readonly RoutedField[]): RoutedField[] {
+  const visible: RoutedField[] = []
+  for (const field of fields) if (!isHiddenDestination(field.destination)) visible.push(field)
+  return visible
+}
+
+// ── 2. hidden-criteria grading ────────────────────────────────────────────────
+
+/** What a hidden-criteria grader reports. `passRate = passed / total` over the
+ *  hidden checks; `total === 0` means the criteria never ran (e.g. the artifact
+ *  did not even load) — an honest zero, never a spurious pass. */
+export interface HiddenGradeResult {
+  /** Hidden checks that passed. */
+  passed: number
+  /** Total hidden checks attempted. 0 when the criteria could not run at all. */
+  total: number
+  /** `passed / total`, or 0 when `total === 0`. The PRIMARY correctness score. */
+  passRate: number
+  /** Free-form provenance the caller may record (runner output, reason for 0). */
+  notes?: string
+}
+
+/**
+ * The domain's grader: given the agent's artifact and the HIDDEN criteria,
+ * return a pass rate. This is the ONE seam a non-coding domain implements — the
+ * coding node-test executor is a single implementation of it; a legal grader
+ * checks the brief against hidden required holdings, a research grader checks an
+ * answer against held-out facts, a tax grader runs hidden return assertions.
+ * The substrate calls it ONLY at grading time, behind the firewall.
+ *
+ * `THidden` is the domain's hidden-criteria payload (held-out suite, answer key,
+ * hidden requirements) — opaque to the substrate. The result may be sync or async.
+ */
+export type HiddenCriteriaGrader<TArtifact, THidden> = (
+  artifact: TArtifact,
+  hiddenCriteria: THidden,
+  signal?: AbortSignal, // optional; `gradeOnHidden` forwards its own `signal` argument here unchanged
+) => Promise<HiddenGradeResult> | HiddenGradeResult
+
+/** Normalize a grader's raw `{passed, total}` into a `HiddenGradeResult`: both
+ *  counts are floored, non-finite/negative counts become 0, `passed` is capped at
+ *  `total` (so a no-run reports 0 of 0), and `passRate` is 0 when `total` is 0. */
+export function hiddenGrade(passed: number, total: number, notes?: string): HiddenGradeResult {
+  const t = Number.isFinite(total) && total > 0 ? Math.floor(total) : 0
+  const p = Number.isFinite(passed) && passed > 0 ? Math.min(Math.floor(passed), t) : 0
+  const passRate = t > 0 ? p / t : 0
+  return { passed: p, total: t, passRate, notes }
+}
+
+/**
+ * Grade an artifact on hidden criteria with the firewall enforced in the same
+ * call. First re-runs `assertNoHiddenLeak` on the routed fields and the agent
+ * context the run actually used (throwing before any grading if a hidden value
+ * leaked), then awaits the domain grader and normalizes its `{passed, total,
+ * notes}` through `hiddenGrade`. A domain that already asserted the firewall at
+ * dispatch time may call its grader directly and pass the rate to `blendHeldout`.
+ */
+export async function gradeOnHidden<TArtifact, THidden>(args: {
+  artifact: TArtifact
+  hiddenCriteria: THidden
+  grader: HiddenCriteriaGrader<TArtifact, THidden>
+  /** The routed fields + the exact agent context, re-checked before grading. */
+  firewall: { fields: readonly RoutedField[]; agentContext: string; options?: NoLeakOptions }
+  signal?: AbortSignal
+}): Promise<HiddenGradeResult> {
+  const { fields, agentContext, options } = args.firewall
+  assertNoHiddenLeak(fields, agentContext, options)
+  const graded = await args.grader(args.artifact, args.hiddenCriteria, args.signal)
+  return hiddenGrade(graded.passed, graded.total, graded.notes)
+}
+
+// ── the composite: hidden correctness (PRIMARY) + judge quality (secondary) ────
+
+/** Weights for the held-out / judge blend. Any positive ratio works: they are
+ *  renormalized to sum to 1. A non-finite or negative weight is treated as 0. */
+export interface BlendWeights {
+  /** Weight on the hidden-criteria pass rate (the primary, ungameable score). */
+  heldout: number
+  /** Weight on the judge's quality composite (the secondary style/quality score). */
+  judge: number
+}
+
+/** Default blend: 0.7 hidden correctness, 0.3 judge quality — execution truth
+ *  dominates, style refines. Frozen because every default-weighted call shares it. */
+export const defaultBlendWeights: Readonly<BlendWeights> = Object.freeze({ heldout: 0.7, judge: 0.3 })
+
+/** The input shape a judge's `score` receives — exactly `JudgeConfig.score`'s
+ *  argument: the artifact, plus any scenario/signal fields the judge carries.
+ *  `withHeldoutBlend` only reads `artifact`; the rest rides through. */
+export interface JudgeScoreInput<TArtifact> {
+  artifact: TArtifact
+  /** Pass-through for the judge's extra input fields (scenario, signal). */
+  [key: string]: unknown
+}
+
+function normalizeWeights(weights: BlendWeights): { heldout: number; judge: number } {
+  // Non-finite or negative weights count as 0; the pair is then scaled to sum to 1.
+  const usable = (w: number): number => (Number.isFinite(w) && w >= 0 ? w : 0)
+  const [heldout, judge] = [usable(weights.heldout), usable(weights.judge)]
+  if (heldout + judge <= 0) {
+    throw new ValidationError(
+      'blend weights must have a positive sum (got heldout+judge <= 0) — cannot weight a composite by zero',
+    )
+  }
+  return { heldout: heldout / (heldout + judge), judge: judge / (heldout + judge) }
+}
+
+/**
+ * Blend the PRIMARY hidden-criteria pass rate with the SECONDARY judge composite
+ * into the one score the leaderboard ranks on. The weights are renormalized to
+ * sum to 1 and both inputs are clamped to [0,1] first, so failing the hidden
+ * criteria caps the score low however much the judge liked the style, while a
+ * correct but stylistically mediocre solution keeps the bulk of the points. A
+ * judge on a non-unit scale must be normalized by the caller beforehand.
+ */
+export function blendHeldout(
+  heldoutPassRate: number,
+  judgeScore: number,
+  weights: BlendWeights = defaultBlendWeights,
+): number {
+  const { heldout: heldoutWeight, judge: judgeWeight } = normalizeWeights(weights)
+  const correctness = clampUnit(heldoutPassRate)
+  const quality = clampUnit(judgeScore)
+  return heldoutWeight * correctness + judgeWeight * quality
+}
+
+/**
+ * Decorate a judge's `score` function so that the `composite` it reports is the
+ * held-out-weighted blend rather than the judge's own number. The judge's
+ * dimensions are kept as scored; only `composite` and `notes` are rewritten:
+ * `composite` becomes `blendHeldout(heldoutPassRate(artifact), judgeComposite,
+ * weights)` and `notes` gains a breakdown of the two terms ahead of the judge's
+ * own notes. A verdict flagged `failed` is returned untouched. The pass rate is
+ * read off the artifact through `heldoutPassRate`, so no second grading pass runs.
+ *
+ * Generic over the artifact type, inferred from `heldoutPassRate`. The input is
+ * the judge's `{ artifact, ... }`; extra fields (`scenario`, `signal`) are handed
+ * to the wrapped `score` as-is because the whole `input` object is forwarded.
+ */
+export function withHeldoutBlend<TArtifact>(
+  score: (input: JudgeScoreInput<TArtifact>) => JudgeScore | Promise<JudgeScore>,
+  heldoutPassRate: (artifact: TArtifact) => number,
+  weights: BlendWeights = defaultBlendWeights,
+): (input: JudgeScoreInput<TArtifact>) => Promise<JudgeScore> {
+  return async (input: JudgeScoreInput<TArtifact>): Promise<JudgeScore> => {
+    const judged = await score(input)
+    // A failed verdict is handed back as-is: no blending, no rewritten notes.
+    if (judged.failed) return judged
+    const rate = clampUnit(heldoutPassRate(input.artifact))
+    const composite = blendHeldout(rate, judged.composite, weights)
+    const share = normalizeWeights(weights)
+    const heldoutTerm = `held-out ${(rate * 100).toFixed(0)}% × ${share.heldout.toFixed(2)}`
+    const qualityTerm = `quality ${judged.composite.toFixed(3)} × ${share.judge.toFixed(2)}`
+    const summary = `composite=${composite.toFixed(3)} (${heldoutTerm} + ${qualityTerm})`
+    return {
+      ...judged,
+      composite,
+      notes: judged.notes ? `${summary} — ${judged.notes}` : summary,
+    }
+  }
+}
+
+function clampUnit(value: number): number {
+  const finite = Number.isFinite(value) ? value : 0 // NaN and ±Infinity score 0
+  return Math.min(1, Math.max(0, finite))
+}
diff --git a/src/index.ts b/src/index.ts
index 2a84b8b..0e1402f 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -609,6 +609,27 @@ export {
   DEFAULT_RULES as DEFAULT_FAILURE_RULES,
   FAILURE_CLASSES,
 } from './failure-taxonomy'
+export type {
+  BlendWeights,
+  FieldDestination,
+  HiddenCriteriaGrader,
+  HiddenGradeResult,
+  HiddenLeak,
+  JudgeScoreInput,
+  NoLeakOptions,
+  RoutedField,
+} from './hidden-criteria-grading'
+export {
+  agentVisibleFields,
+  assertNoHiddenLeak,
+  blendHeldout,
+  defaultBlendWeights,
+  gradeOnHidden,
+  hiddenGrade,
+  isHiddenDestination,
+  routeFields,
+  withHeldoutBlend,
+} from './hidden-criteria-grading'
 export type {
   ProjectRuntimeTrajectoryEvidenceOptions,
   RuntimeTrajectoryEvidenceProjection,