Skip to content

Commit 3166156

Browse files
feat: add cost and overall summary to evals (#8)
* feat: add cost and overall summary to evals
* feat: add cost and overall summary to evals
* wip
* things in a better shape
* minor changes

---------

Co-authored-by: Mohammad Bagher Abiyat <37929992+Aslemammad@users.noreply.github.com>
1 parent a69a3fa commit 3166156

10 files changed

Lines changed: 295 additions & 196 deletions

File tree

.github/workflows/benchmark-reusable.yml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,18 @@ jobs:
140140
exit 1
141141
fi
142142
143+
- name: Log benchmark summary
144+
if: always()
145+
run: |
146+
set -euo pipefail
147+
if [ -f benchmark.json ]; then
148+
echo "=== Benchmark Summary ==="
149+
jq -r '.summary // "No summary available"' benchmark.json
150+
echo "========================"
151+
else
152+
echo "benchmark.json not found, skipping summary log"
153+
fi
154+
143155
- name: Prepare artifact name
144156
id: artifact
145157
env:

agents/claude-code.ts

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,12 @@ const claudeCodeAgent: AgentDefinition = {
106106
const cacheKey = sessionKey(cwd, model);
107107
const existingSessionID = sessionCache.get(cacheKey);
108108

109+
const actions: string[] = [];
110+
const usage = {
111+
input: 0,
112+
output: 0,
113+
};
114+
109115
try {
110116
const result = query({
111117
prompt,
@@ -122,6 +128,14 @@ const claudeCodeAgent: AgentDefinition = {
122128
for await (const message of result) {
123129
// Extract and cache session ID from messages
124130
sessionCache.set(cacheKey, message.session_id);
131+
132+
// Accumulate token usage if available (only SDKResultMessage has usage)
133+
if (message.type === "result" && "usage" in message) {
134+
usage.input += message.usage.input_tokens || 0;
135+
usage.output += message.usage.output_tokens || 0;
136+
}
137+
138+
actions.push(JSON.stringify(message));
125139
logJson(message, options);
126140
}
127141
} catch (error) {
@@ -137,7 +151,7 @@ const claudeCodeAgent: AgentDefinition = {
137151
throw error;
138152
}
139153

140-
return { command: displayCommand };
154+
return { command: displayCommand, actions, usage };
141155
},
142156
};
143157

agents/codex.ts

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ import process from "node:process";
33

44
import {
55
Codex,
6+
Usage,
67
type CommandExecutionItem,
78
type SandboxMode,
89
type Thread,
@@ -122,15 +123,27 @@ const codexAgent: AgentDefinition = {
122123
const key = sessionKey(model, cwd);
123124
const thread = getOrCreateThread(model, cwd);
124125

126+
const actions: string[] = [];
127+
let usage: Usage;
125128
try {
126129
const turn = await thread.run(prompt);
130+
assert(turn.usage, "The agent did not emit the usage information.");
131+
usage = turn.usage;
132+
actions.push(...turn.items.map((item) => JSON.stringify(item)));
127133
logTurnItems(turn.items, options);
128134
} catch (error) {
129135
threadCache.delete(key);
130136
throw error;
131137
}
132138

133-
return { command: displayCommand };
139+
return {
140+
command: displayCommand,
141+
actions,
142+
usage: {
143+
input: usage.input_tokens,
144+
output: usage.output_tokens,
145+
},
146+
};
134147
},
135148
};
136149

agents/opencode.ts

Lines changed: 29 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -73,33 +73,36 @@ function writeLog(
7373

7474
function logJson(value: unknown, options: AgentRunOptions | undefined): void {
7575
try {
76-
writeLog(process.stdout, JSON.stringify(value), options?.logPrefix);
76+
const message = JSON.stringify(value);
77+
writeLog(process.stdout, message, options?.logPrefix);
7778
} catch (error) {
7879
const reason = error instanceof Error ? error.message : String(error);
79-
writeLog(
80-
process.stdout,
81-
JSON.stringify({ error: "serialization_failed", reason }),
82-
options?.logPrefix,
83-
);
80+
const errorMessage = JSON.stringify({
81+
error: "serialization_failed",
82+
reason,
83+
});
84+
writeLog(process.stdout, errorMessage, options?.logPrefix);
8485
}
8586
}
8687

8788
function logError(value: unknown, options: AgentRunOptions | undefined): void {
8889
try {
89-
writeLog(process.stderr, JSON.stringify(value), options?.logPrefix);
90+
const message = JSON.stringify(value);
91+
writeLog(process.stderr, message, options?.logPrefix);
9092
} catch (error) {
9193
const reason = error instanceof Error ? error.message : String(error);
92-
writeLog(
93-
process.stderr,
94-
JSON.stringify({ error: "serialization_failed", reason }),
95-
options?.logPrefix,
96-
);
94+
const errorMessage = JSON.stringify({
95+
error: "serialization_failed",
96+
reason,
97+
});
98+
writeLog(process.stderr, errorMessage, options?.logPrefix);
9799
}
98100
}
99101

100102
function logPromptResult(
101103
result: { info: AssistantMessage; parts: Part[] },
102104
options: AgentRunOptions | undefined,
105+
logs?: string[],
103106
): void {
104107
logJson({ info: result.info }, options);
105108
if (Array.isArray(result.parts)) {
@@ -167,6 +170,11 @@ const opencodeAgent: AgentDefinition = {
167170
sessionCache.set(cacheKey, sessionID);
168171
}
169172

173+
const actions: string[] = [];
174+
const usage = {
175+
input: 0,
176+
output: 0,
177+
};
170178
try {
171179
const [providerID, modelID] = model.split("/");
172180
const { data } = await opencode.client.session.prompt({
@@ -182,6 +190,14 @@ const opencodeAgent: AgentDefinition = {
182190
throwOnError: true,
183191
});
184192

193+
usage.input = data.info.tokens.input;
194+
usage.output = data.info.tokens.output;
195+
196+
actions.push(JSON.stringify(data.info));
197+
if (Array.isArray(data.parts)) {
198+
data.parts.forEach((part) => actions.push(JSON.stringify(part)));
199+
}
200+
185201
logPromptResult(data, options);
186202
} catch (error) {
187203
sessionCache.delete(cacheKey);
@@ -195,7 +211,7 @@ const opencodeAgent: AgentDefinition = {
195211
throw error;
196212
}
197213

198-
return { command: displayCommand };
214+
return { command: displayCommand, actions, usage };
199215
},
200216
};
201217

0 commit comments

Comments
 (0)