Skip to content

Commit 54643ba

Browse files
committed
things in a better shape
1 parent 313a523 commit 54643ba

8 files changed

Lines changed: 166 additions & 9 deletions

File tree

.github/workflows/benchmark-reusable.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,18 @@ jobs:
140140
exit 1
141141
fi
142142
143+
- name: Log benchmark summary
  # Informational step: `always()` lets it run even if an earlier step failed.
  if: always()
  run: |
    set -euo pipefail
    if [ -f benchmark.json ]; then
      echo "=== Benchmark Summary ==="
      # jq's `//` only substitutes for null/false, so an empty-string summary
      # is mapped to the placeholder explicitly. The top-level shape of
      # benchmark.json is not visible from this workflow, so both a single
      # object and an array of objects are accepted.
      jq -r '
        (if type == "array" then .[] else . end)
        | (.summary // "")
        | if . == "" then "No summary available" else . end
      ' benchmark.json
      echo "========================"
    else
      echo "benchmark.json not found, skipping summary log"
    fi
154+
143155
- name: Prepare artifact name
144156
id: artifact
145157
env:

agents/claude-code.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ const claudeCodeAgent: AgentDefinition = {
106106
const cacheKey = sessionKey(cwd, model);
107107
const existingSessionID = sessionCache.get(cacheKey);
108108

109+
const actions: string[] = [];
109110
const usage = {
110111
input: 0,
111112
output: 0,
@@ -128,12 +129,13 @@ const claudeCodeAgent: AgentDefinition = {
128129
// Extract and cache session ID from messages
129130
sessionCache.set(cacheKey, message.session_id);
130131

131-
// Accumulate token usage if available
132-
if (message.usage) {
132+
// Accumulate token usage if available (only SDKResultMessage has usage)
133+
if (message.type === "result" && "usage" in message) {
133134
usage.input += message.usage.input_tokens || 0;
134135
usage.output += message.usage.output_tokens || 0;
135136
}
136137

138+
actions.push(JSON.stringify(message));
137139
logJson(message, options);
138140
}
139141
} catch (error) {
@@ -149,7 +151,7 @@ const claudeCodeAgent: AgentDefinition = {
149151
throw error;
150152
}
151153

152-
return { command: displayCommand, usage };
154+
return { command: displayCommand, actions, usage };
153155
},
154156
};
155157

agents/opencode.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ const opencodeAgent: AgentDefinition = {
170170
sessionCache.set(cacheKey, sessionID);
171171
}
172172

173+
const actions: string[] = [];
173174
const usage = {
174175
input: 0,
175176
output: 0,
@@ -192,6 +193,11 @@ const opencodeAgent: AgentDefinition = {
192193
usage.input = data.info.tokens.input;
193194
usage.output = data.info.tokens.output;
194195

196+
actions.push(JSON.stringify(data.info));
197+
if (Array.isArray(data.parts)) {
198+
data.parts.forEach((part) => actions.push(JSON.stringify(part)));
199+
}
200+
195201
logPromptResult(data, options);
196202
} catch (error) {
197203
sessionCache.delete(cacheKey);
@@ -205,7 +211,7 @@ const opencodeAgent: AgentDefinition = {
205211
throw error;
206212
}
207213

208-
return { command: displayCommand, usage };
214+
return { command: displayCommand, actions, usage };
209215
},
210216
};
211217

cli.ts

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ import { listScores, scores as scoreRegistry } from "~/scores/index.js";
1616
import { dataset } from "~/lib/dataset.js";
1717
import type { DatasetEval, ScoreAssignment } from "~/lib/dataset.js";
1818
import { generatePlannerTasks, type PlannerTask } from "~/lib/planner.js";
19+
import {
20+
generateActionsSummary,
21+
type EpisodeActions,
22+
} from "~/lib/summarizer.js";
1923
import { fetchPlannerCommitDiffs } from "~/lib/github.js";
2024
import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";
2125
import { judges, getJudgeModelId } from "~/judges.js";
@@ -31,7 +35,6 @@ import type {
3135
import type {
3236
Episode,
3337
EvaluationRunExport,
34-
TokenUsage,
3538
Usage,
3639
} from "~/types/export.js";
3740
import { withRetries, withTimeout } from "~/lib/utils/retry.js";
@@ -257,6 +260,7 @@ async function main(): Promise<void> {
257260
aggregationSummary: AggregationSummary;
258261
scoreExports: EvaluationRunExport["scores"];
259262
logs: string[];
263+
actions: string[];
260264
usage: Usage;
261265
}
262266

@@ -307,6 +311,7 @@ async function main(): Promise<void> {
307311
let tasksExecuted = 0;
308312

309313
let usage: Usage = { input: 0, output: 0 };
314+
const episodeActions: string[] = [];
310315

311316
for (const task of plannerTasks) {
312317
const logPrefix = `${prefix} ${task.commit}`;
@@ -349,6 +354,9 @@ async function main(): Promise<void> {
349354

350355
usage.input += result.usage.input;
351356
usage.output += result.usage.output;
357+
358+
// Collect actions from this task
359+
episodeActions.push(...result.actions);
352360
} catch (error) {
353361
const message =
354362
error instanceof Error ? error.message : String(error);
@@ -405,6 +413,7 @@ async function main(): Promise<void> {
405413
aggregationSummary,
406414
scoreExports: episodeScoreExports,
407415
logs: [],
416+
actions: episodeActions,
408417
usage,
409418
};
410419
} finally {
@@ -450,6 +459,7 @@ async function main(): Promise<void> {
450459
const aggregatedInputs = new Map<string, ScoreAggregationInput>();
451460
const episodeExports: Episode[] = [];
452461
const allLogs: string[] = [];
462+
const episodesActions: EpisodeActions[] = [];
453463
const averageUsage = episodeResults.reduce(
454464
(prev, { usage }) => ({
455465
input: prev.input + usage.input / episodeResults.length,
@@ -472,6 +482,28 @@ async function main(): Promise<void> {
472482
if (result.logs && result.logs.length > 0) {
473483
allLogs.push(...result.logs);
474484
}
485+
486+
// Collect actions for summarization
487+
episodesActions.push({
488+
episodeIndex: result.index,
489+
actions: result.actions,
490+
});
491+
}
492+
493+
// Generate summary from all episodes' actions
494+
let summary = "";
495+
try {
496+
summary = await generateActionsSummary(
497+
evalDefinition,
498+
model,
499+
episodesActions,
500+
);
501+
} catch (error) {
502+
const message = error instanceof Error ? error.message : String(error);
503+
console.error(
504+
`[${combinationLabel}] Failed to generate summary: ${message}`,
505+
);
506+
summary = ""; // Keep empty string on failure
475507
}
476508

477509
return summarizeAggregation(
@@ -482,7 +514,6 @@ async function main(): Promise<void> {
482514
aggregatedInputs,
483515
episodeExports,
484516
averageUsage,
485-
"",
486517
summary,
487518
);
488519
};
@@ -497,7 +528,7 @@ async function main(): Promise<void> {
497528
agentName,
498529
);
499530

500-
evaluationResult.summaries.forEach((line) => {
531+
evaluationResult.lines.forEach((line) => {
501532
console.log(line);
502533
});
503534

index.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
export { getAgent, listAgents } from "~/agents/index.js";
22
export { scores, getScore, listScores } from "~/scores/index.js";
33
export type { AgentDefinition, AgentExecutor, AgentPrompt } from "~/lib/createAgent.js";
4-
export { createAgent } from "~/lib/createAgent.js";
54
export { createScore } from "~/lib/createScore.js";
65
export type {
76
ScoreDefinition,

lib/createAgent.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export interface AgentDefinition {
2424

2525
export interface AgentRunResult {
2626
command: string;
27-
messages: string[];
27+
actions: string[];
2828
usage: {
2929
input: number;
3030
output: number;

lib/summarizer.ts

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import { generateText } from "ai";
2+
import { z } from "zod";
3+
4+
import type { DatasetEval } from "~/lib/dataset.js";
5+
import { getZenLanguageModel } from "~/lib/zenModels.js";
6+
7+
/**
 * Reads an environment variable, substituting a default when it is unset or blank.
 *
 * @param envName - Name of the variable to look up in `process.env`.
 * @param defaultValue - Value returned when the variable is missing or trims to an empty string.
 * @returns The trimmed variable value, or `defaultValue`.
 */
const fallback = (envName: string, defaultValue: string): string => {
  const raw = process.env[envName];
  const trimmed = raw === undefined ? "" : raw.trim();
  return trimmed === "" ? defaultValue : trimmed;
};
9+
10+
/** Actions captured for one evaluation episode, as passed to the summarizer. */
export interface EpisodeActions {
  /** Index identifying the episode these actions belong to. */
  episodeIndex: number;
  /** Recorded actions, one string per entry, in the order they were collected. */
  actions: Array<string>;
}
14+
15+
/**
 * System prompt for the summarizer model.
 *
 * The episode count is deliberately not hard-coded: the prompt is used for
 * however many episodes are passed to `generateActionsSummary`.
 */
const systemPrompt = `You are a technical summarizer that creates concise, informative summaries of autonomous agent activities across multiple evaluation episodes.

Your task:
- Analyze the actions taken by an AI agent across one or more separate episodes of the same task
- Identify common patterns, tools used, files modified, and key behaviors
- Produce a clear, structured summary that highlights what the agent did

Focus on:
- **Tool usage patterns**: Which tools were used most frequently
- **File modifications**: Which files were created, edited, or read
- **Common strategies**: What approach did the agent consistently take
- **Consistency**: Did the agent behave similarly across episodes, or vary significantly?
- **Outcomes**: Any errors, successes, or notable behaviors

Output format:
Write 2-4 paragraphs in a professional, technical style. Be concise but informative.

Structure:
1. **Overview**: Brief description of what the agent accomplished
2. **Approach**: Tools and strategies used consistently across episodes
3. **Key actions**: Specific files modified or critical operations performed
4. **Observations**: Any notable patterns, inconsistencies, or issues

Guidelines:
- Keep it under 300 words
- Use technical language but be clear
- Focus on patterns across episodes, not individual actions
- Mention specific tool names and file paths when relevant
- Note any errors or issues encountered
- Be objective and descriptive, not evaluative`;
45+
46+
/** Environment variable that overrides which model writes the summary. */
const summarizerModelEnvVar = "SUMMARIZER_MODEL";

/** Model id used when the override variable is unset or blank. */
const defaultSummarizerModelId = "opencode/claude-sonnet-4-5";

/** Model id handed to `getZenLanguageModel` when generating summaries. */
const summarizerModelId = fallback(
  summarizerModelEnvVar,
  defaultSummarizerModelId,
);
50+
51+
/** Maximum number of actions per episode that are forwarded to the summarizer model. */
const maxActionsPerEpisode = 50;

/** Renders one episode as a prompt section, listing at most `maxActionsPerEpisode` actions. */
function formatEpisodeActions(ep: EpisodeActions): string {
  const shown = ep.actions.slice(0, maxActionsPerEpisode);
  const hiddenCount = ep.actions.length - shown.length;
  const truncated =
    hiddenCount > 0 ? `\n... (${hiddenCount} more actions)` : "";

  return `### Episode ${ep.episodeIndex}
Actions (${ep.actions.length} total):
${shown.join("\n")}${truncated}`;
}

/**
 * Asks the summarizer model for a short description of what an agent did
 * across the given episodes.
 *
 * @param evaluation - Evaluation being run; its `repo`, `from` and `to` fields are quoted in the prompt.
 * @param model - Identifier of the model under evaluation (prompt context only; the
 *   summary itself is written by the model named in `summarizerModelId`).
 * @param episodesActions - Recorded actions, one entry per episode.
 * @returns The trimmed model output; `"No actions recorded"` when there is nothing
 *   to summarize; or `"Unable to generate summary: <message>"` when generation fails.
 *   Generation errors are caught and logged here rather than rethrown, so callers
 *   must inspect the returned text to detect that case.
 */
export async function generateActionsSummary(
  evaluation: DatasetEval,
  model: string,
  episodesActions: EpisodeActions[],
): Promise<string> {
  // Covers both "no episodes" and "episodes that recorded zero actions".
  // Skipping the model call avoids asking it to summarize empty input.
  if (episodesActions.every((ep) => ep.actions.length === 0)) {
    return "No actions recorded";
  }

  // Build a structured prompt with the actions data
  const episodesSummary = episodesActions
    .map(formatEpisodeActions)
    .join("\n\n");

  const prompt = `Repository: ${evaluation.repo}
Model: ${model}
Task: Implement changes from ${evaluation.from.slice(0, 7)} to ${evaluation.to.slice(0, 7)}

${episodesSummary}

Provide a concise summary of what the agent did across these episodes.`;

  try {
    const result = await generateText({
      model: getZenLanguageModel(summarizerModelId),
      system: systemPrompt,
      temperature: 0.3,
      prompt,
    });

    return result.text.trim();
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[summarizer] Failed to generate summary: ${message}`);
    return `Unable to generate summary: ${message}`;
  }
}

scripts/discord-sample.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,18 +126,21 @@ const claudeEpisodes: Episode[] = [
126126
baseScore: 0.912,
127127
variancePenalty: 0.003,
128128
scores: cloneScores(claudeScores, 0.002, -0.005),
129+
usage: { input: 50000, output: 10000 },
129130
},
130131
{
131132
finalScore: 0.901,
132133
baseScore: 0.905,
133134
variancePenalty: 0.004,
134135
scores: cloneScores(claudeScores, 0, 0),
136+
usage: { input: 51000, output: 10500 },
135137
},
136138
{
137139
finalScore: 0.896,
138140
baseScore: 0.902,
139141
variancePenalty: 0.006,
140142
scores: cloneScores(claudeScores, -0.002, 0.004),
143+
usage: { input: 49000, output: 9800 },
141144
},
142145
];
143146

@@ -183,18 +186,21 @@ const gptEpisodes: Episode[] = [
183186
baseScore: 0.907,
184187
variancePenalty: 0.004,
185188
scores: cloneScores(gptScores, 0.003, -0.006),
189+
usage: { input: 48000, output: 9500 },
186190
},
187191
{
188192
finalScore: 0.894,
189193
baseScore: 0.898,
190194
variancePenalty: 0.004,
191195
scores: cloneScores(gptScores, -0.002, 0.002),
196+
usage: { input: 49000, output: 9800 },
192197
},
193198
{
194199
finalScore: 0.892,
195200
baseScore: 0.897,
196201
variancePenalty: 0.005,
197202
scores: cloneScores(gptScores, -0.003, 0.004),
203+
usage: { input: 47500, output: 9400 },
198204
},
199205
];
200206

@@ -214,6 +220,8 @@ const sampleExport: EvaluationRunExport[] = [
214220
variancePenalty: 0.003,
215221
scores: claudeScores,
216222
episodes: claudeEpisodes,
223+
usage: { input: 50000, output: 10100 },
224+
summary: "",
217225
},
218226
{
219227
agent: "opencode",
@@ -230,6 +238,8 @@ const sampleExport: EvaluationRunExport[] = [
230238
variancePenalty: 0.004,
231239
scores: gptScores,
232240
episodes: gptEpisodes,
241+
usage: { input: 48167, output: 9633 },
242+
summary: "",
233243
},
234244
];
235245

0 commit comments

Comments
 (0)