Skip to content

Commit 605f704

Browse files
committed
wip
1 parent 1394e2d commit 605f704

File tree

3 files changed

+250
-467
lines changed

3 files changed

+250
-467
lines changed

.github/workflows/benchmark-reusable.yml

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,30 @@ permissions:
1717
actions: read
1818

1919
jobs:
20+
prepare-analysis:
21+
name: Prepare Judge Analysis Matrix
22+
runs-on: ubuntu-latest
23+
outputs:
24+
evals: ${{ steps.compute.outputs.evals }}
25+
steps:
26+
- name: Extract unique evaluations
27+
id: compute
28+
run: |
29+
set -euo pipefail
30+
matrix_json=$(cat <<'JSON'
31+
${{ inputs.matrix }}
32+
JSON
33+
)
34+
evals=$(jq -c 'map(.eval) | unique' <<<"$matrix_json")
35+
36+
if [ -z "${evals}" ] || [ "${evals}" = "null" ]; then
37+
echo "No evaluations found in matrix definition." >&2
38+
evals="[]"
39+
fi
40+
41+
echo "Analysis eval matrix: ${evals}"
42+
echo "evals=${evals}" >> "$GITHUB_OUTPUT"
43+
2044
benchmark:
2145
name: Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
2246
runs-on: ubuntu-latest
@@ -148,10 +172,8 @@ jobs:
148172
echo "benchmark.json not found, skipping summary log"
149173
fi
150174
151-
- name: Generate Judges Summary for this Evaluation
175+
- name: Run benchmark-level judge analysis
152176
env:
153-
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
154-
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
155177
BENCHMARK_EVAL: ${{ matrix.eval }}
156178
run: |
157179
set -euo pipefail
@@ -161,7 +183,7 @@ jobs:
161183
echo "Evaluation: ${BENCHMARK_EVAL}"
162184
echo "═══════════════════════════════════════════════════════"
163185
echo ""
164-
bun run scripts/judges-summary.ts benchmark.json --ai-summary || true
186+
bun run scripts/analysis.ts benchmark.json || true
165187
echo ""
166188
167189
- name: Prepare artifact name
@@ -183,6 +205,55 @@ jobs:
183205
name: ${{ steps.artifact.outputs.name }}
184206
path: benchmark.json
185207

208+
eval-analysis:
209+
name: Judge Analysis - ${{ matrix.eval }}
210+
runs-on: ubuntu-latest
211+
needs:
212+
- benchmark
213+
- prepare-analysis
214+
if: needs.prepare-analysis.outputs.evals != '[]'
215+
environment: production
216+
strategy:
217+
fail-fast: false
218+
matrix:
219+
eval: ${{ fromJSON(needs.prepare-analysis.outputs.evals) }}
220+
steps:
221+
- name: Checkout repository
222+
uses: actions/checkout@v4
223+
224+
- name: Setup Bun
225+
uses: oven-sh/setup-bun@v1
226+
with:
227+
bun-version: 1.2.21
228+
229+
- name: Install dependencies
230+
run: bun install --frozen-lockfile
231+
232+
- name: Download benchmark artifacts for eval
233+
uses: actions/download-artifact@v4
234+
with:
235+
path: eval-benchmarks
236+
pattern: benchmark-*-*-${{ replace(matrix.eval, '/', '-') }}
237+
238+
- name: Merge benchmark exports
239+
id: merge
240+
run: |
241+
set -euo pipefail
242+
243+
if [ ! -d eval-benchmarks ] || ! find eval-benchmarks -type f -name '*.json' -print -quit | grep -q .; then
244+
echo "No benchmark artifacts found for eval ${{ matrix.eval }}; skipping analysis."
245+
echo "has_data=false" >> "$GITHUB_OUTPUT"
246+
exit 0
247+
fi
248+
249+
bun run scripts/merge-benchmark-exports.ts eval-benchmarks merged-benchmark.json
250+
echo "Merged benchmark export ready for analysis."
251+
echo "has_data=true" >> "$GITHUB_OUTPUT"
252+
253+
- name: Run judges analysis
254+
if: steps.merge.outputs.has_data == 'true'
255+
run: bun run scripts/analysis.ts merged-benchmark.json
256+
186257
notify:
187258
runs-on: ubuntu-latest
188259
needs: benchmark

scripts/analysis.ts

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
#!/usr/bin/env bun
2+
/**
3+
* Agent Analysis Script
4+
*
5+
* Compares agent/model performance for one or more benchmark exports.
6+
* Accepts either a single benchmark run or a merged export containing
7+
* multiple runs of the same evaluation across different agents/models.
8+
*
9+
* Usage:
10+
* bun run scripts/analysis.ts path/to/benchmark.json
11+
* bun run scripts/analysis.ts path/to/merged.json
12+
*/
13+
14+
import { readFileSync } from "node:fs";
15+
import process from "node:process";
16+
import { generateText } from "ai";
17+
import type { EvaluationRunExport } from "~/types/export.js";
18+
import { getZenLanguageModel } from "~/lib/zenModels.js";
19+
20+
export const AGENT_ANALYSIS_PROMPT = `You are an expert analyst reviewing how different agents and models perform on the same benchmark evaluation.
21+
22+
Your task is to analyze the benchmark data and identify:
23+
1. **Systematic patterns**: Are certain agents or models consistently leading or lagging?
24+
2. **Performance gaps**: Where are the largest score deltas, and what might explain them?
25+
3. **Agent tendencies**: Do some runs prioritize certain behaviors (e.g., safety, completeness) based on their summaries?
26+
4. **Notable insights**: Highlight interesting contrasts between the strongest and weakest runs.
27+
5. **Recommendations**: Suggest concrete adjustments or experiments to improve future runs.
28+
29+
Focus on concrete observations from the data provided. Look for patterns such as:
30+
- Consistent scoring differences between specific agents or models
31+
- Summaries that reveal different optimization strategies or failure modes
32+
- Runs that score well overall but exhibit weaknesses in their own write-ups
33+
34+
Provide a concise, insightful analysis that helps developers understand agent behavior and improve the evaluation system.`;
35+
36+
const fallback = (envName: string, defaultValue: string): string =>
37+
process.env[envName]?.trim() || defaultValue;
38+
39+
const analyzerModelId = fallback(
40+
"ANALYZER_MODEL",
41+
"opencode/claude-sonnet-4-5",
42+
);
43+
44+
/**
 * Builds the user-prompt context handed to the analyzer model.
 *
 * Output is a lightweight markdown document: an evaluation header (repo,
 * run count) followed by a numbered scoreboard line per run with its
 * final/base scores and variance penalty (3 decimal places each), plus the
 * run's own summary when present.
 *
 * NOTE(review): assumes `runs` is non-empty — `runs[0]` is read without a
 * guard. The only visible caller (main) exits before calling this with an
 * empty array; confirm before reusing elsewhere.
 */
function buildDynamicContext(runs: EvaluationRunExport[]): string {
  const repo = runs[0].evaluation.repo;
  const parts: string[] = [];

  parts.push(`# Evaluation
- Repository: ${repo}
- Total runs: ${runs.length}
`);

  parts.push("# Run Scoreboard");
  runs.forEach((run, index) => {
    parts.push(
      // One scoreboard line per run; toFixed(3) keeps scores comparable.
      `${index + 1}. ${run.agent} (${run.model}) — final ${run.finalScore.toFixed(3)}, base ${run.baseScore.toFixed(
        3,
      )}, penalty ${run.variancePenalty.toFixed(3)}`,
    );
    // Include the run's self-reported summary only when it is non-blank.
    if (run.summary?.trim()) {
      parts.push(` Summary: ${run.summary.trim()}`);
    }
  });
  parts.push("");

  return parts.join("\n");
}
68+
69+
/**
 * Plain-text report used when the AI analysis call fails.
 *
 * Lists each run with its final/base scores and variance penalty, collapses
 * each run's own summary onto a single line, and — when more than one run is
 * present — appends the score gap between the first and last entries.
 *
 * NOTE(review): assumes `runs` is non-empty (`runs[0]` unguarded) and
 * already sorted best-to-worst; the "best to worst" label and the top/bottom
 * gap both rely on the caller (generateAnalysis receives pre-sorted runs
 * from main) having ordered the array.
 */
function formatFallbackSummary(runs: EvaluationRunExport[]): string {
  const repo = runs[0].evaluation.repo;
  const lines: string[] = [];

  lines.push(`Evaluation: ${repo}`);
  lines.push("");
  lines.push("Runs (best to worst):");

  runs.forEach((run, index) => {
    lines.push(
      `${index + 1}. ${run.agent} (${run.model}) – final ${run.finalScore.toFixed(
        3,
      )}, base ${run.baseScore.toFixed(3)}, penalty ${run.variancePenalty.toFixed(
        3,
      )}`,
    );

    if (run.summary?.trim()) {
      // Collapse all internal whitespace so multi-line summaries render
      // as one indented line under the scoreboard entry.
      lines.push(` summary: ${run.summary.replace(/\s+/g, " ").trim()}`);
    }
  });

  if (runs.length > 1) {
    // Spread between the strongest and weakest run (assumes sorted input).
    const leader = runs[0];
    const trailer = runs[runs.length - 1];
    const gap = leader.finalScore - trailer.finalScore;
    lines.push("");
    lines.push(`Score gap (top vs bottom): ${gap.toFixed(3)}`);
  }

  return lines.join("\n").trimEnd();
}
101+
102+
async function generateAnalysis(
103+
runs: EvaluationRunExport[],
104+
): Promise<string> {
105+
const context = buildDynamicContext(runs);
106+
107+
try {
108+
const { text } = await generateText({
109+
model: getZenLanguageModel(analyzerModelId),
110+
system: AGENT_ANALYSIS_PROMPT,
111+
prompt: context,
112+
temperature: 0.3,
113+
});
114+
return text.trim();
115+
} catch (error) {
116+
const message = error instanceof Error ? error.message : String(error);
117+
return `Failed to generate AI analysis (${message}).\n\n${formatFallbackSummary(
118+
runs,
119+
)}`;
120+
}
121+
}
122+
123+
function usage(): void {
124+
console.error("Usage: bun run scripts/analysis.ts <benchmark-file.json>");
125+
console.error("");
126+
console.error(
127+
"Generates an AI-powered comparison of agent/model performance.",
128+
);
129+
}
130+
131+
async function main(): Promise<void> {
132+
const args = process.argv.slice(2);
133+
134+
if (args.length === 0) {
135+
usage();
136+
process.exit(1);
137+
}
138+
139+
const filePath = args[0];
140+
let parsed: unknown;
141+
142+
try {
143+
const fileContent = readFileSync(filePath, "utf-8");
144+
parsed = JSON.parse(fileContent) as unknown;
145+
} catch (error) {
146+
console.error(`Error reading file ${filePath}:`, error);
147+
process.exit(1);
148+
}
149+
150+
const runs = (Array.isArray(parsed)
151+
? (parsed as EvaluationRunExport[])
152+
: [parsed as EvaluationRunExport]).filter(
153+
(run): run is EvaluationRunExport =>
154+
run != null && typeof run === "object" && "finalScore" in run,
155+
);
156+
157+
if (runs.length === 0) {
158+
console.error("No evaluation runs found in the provided file.");
159+
process.exit(1);
160+
}
161+
162+
const orderedRuns = [...runs].sort(
163+
(a, b) => b.finalScore - a.finalScore,
164+
);
165+
166+
const output = await generateAnalysis(orderedRuns);
167+
process.stdout.write(`${output.trimEnd()}\n`);
168+
}
169+
170+
if (import.meta.main) {
171+
main().catch((error) => {
172+
console.error("Error:", error);
173+
process.exit(1);
174+
});
175+
}

0 commit comments

Comments
 (0)