Skip to content

Commit 1394e2d

Browse files
committed
Merge branch 'main' into tmd/add-judges-summary
2 parents 6d002ca + d34556b commit 1394e2d

File tree

13 files changed

+368
-229
lines changed

13 files changed

+368
-229
lines changed

.github/workflows/benchmark-reusable.yml

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -134,11 +134,19 @@ jobs:
134134
COMMAND="bunx \"${PACKAGE_SPEC}\" \"${BENCHMARK_AGENT}\" --eval \"${BENCHMARK_EVAL}\" --model \"${BENCHMARK_MODEL}\" --output \"${OUTPUT_FILE}\""
135135
136136
echo "Executing: ${COMMAND}"
137-
if ! bunx "${PACKAGE_SPEC}" "${BENCHMARK_AGENT}" --eval "${BENCHMARK_EVAL}" --model "${BENCHMARK_MODEL}" --output "${OUTPUT_FILE}"; then
138-
echo "openreval benchmark failed, dumping OpenCode logs..." >&2
139-
find /home/runner/.local/share/opencode/log -type f -print -exec cat {} + || true
140-
exit 1
141-
fi
137+
bunx "${PACKAGE_SPEC}" "${BENCHMARK_AGENT}" --eval "${BENCHMARK_EVAL}" --model "${BENCHMARK_MODEL}" --output "${OUTPUT_FILE}"
138+
139+
- name: Log benchmark summary
140+
if: always()
141+
run: |
142+
set -euo pipefail
143+
if [ -f benchmark.json ]; then
144+
echo "=== Benchmark Summary ==="
145+
jq -r '.summary // "No summary available"' benchmark.json
146+
echo "========================"
147+
else
148+
echo "benchmark.json not found, skipping summary log"
149+
fi
142150
143151
- name: Generate Judges Summary for this Evaluation
144152
env:

.github/workflows/compare-models.yml

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2,26 +2,10 @@ name: Compare Models
22
on:
33
workflow_dispatch:
44
inputs:
5-
codex_gpt_5_codex:
6-
description: codex:gpt-5-codex
7-
type: boolean
8-
default: false
95
opencode_opencode_gpt_5_codex:
106
description: opencode:opencode/gpt-5-codex
117
type: boolean
128
default: false
13-
opencode_opencode_claude_sonnet_4_5:
14-
description: opencode:opencode/claude-sonnet-4-5
15-
type: boolean
16-
default: false
17-
opencode_opencode_big_pickle:
18-
description: opencode:opencode/big-pickle
19-
type: boolean
20-
default: false
21-
claude_code_claude_sonnet_4_5:
22-
description: claude-code:claude-sonnet-4-5
23-
type: boolean
24-
default: false
259
permissions:
2610
contents: read
2711
actions: read

agents/claude-code.ts

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ import type {
1212
const sessionCache = new Map<string, string>();
1313

1414
export const models: string[] = [
15-
"claude-sonnet-4-5",
15+
// "claude-sonnet-4-5",
1616
// "claude-sonnet-4",
1717
// "claude-opus-4-1",
1818
// "claude-3-5-haiku",
@@ -106,6 +106,12 @@ const claudeCodeAgent: AgentDefinition = {
106106
const cacheKey = sessionKey(cwd, model);
107107
const existingSessionID = sessionCache.get(cacheKey);
108108

109+
const actions: string[] = [];
110+
const usage = {
111+
input: 0,
112+
output: 0,
113+
};
114+
109115
try {
110116
const result = query({
111117
prompt,
@@ -122,6 +128,14 @@ const claudeCodeAgent: AgentDefinition = {
122128
for await (const message of result) {
123129
// Extract and cache session ID from messages
124130
sessionCache.set(cacheKey, message.session_id);
131+
132+
// Accumulate token usage if available (only SDKResultMessage has usage)
133+
if (message.type === "result" && "usage" in message) {
134+
usage.input += message.usage.input_tokens || 0;
135+
usage.output += message.usage.output_tokens || 0;
136+
}
137+
138+
actions.push(JSON.stringify(message));
125139
logJson(message, options);
126140
}
127141
} catch (error) {
@@ -137,7 +151,7 @@ const claudeCodeAgent: AgentDefinition = {
137151
throw error;
138152
}
139153

140-
return { command: displayCommand };
154+
return { command: displayCommand, actions, usage };
141155
},
142156
};
143157

agents/codex.ts

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ import process from "node:process";
33

44
import {
55
Codex,
6+
Usage,
67
type CommandExecutionItem,
78
type SandboxMode,
89
type Thread,
@@ -21,7 +22,7 @@ const codexClient = new Codex();
2122
const threadCache = new Map<string, Thread>();
2223

2324
export const models: string[] = [
24-
"gpt-5-codex",
25+
// "gpt-5-codex",
2526
// "gpt-5",
2627
// "o3",
2728
// "o4-mini"
@@ -122,15 +123,27 @@ const codexAgent: AgentDefinition = {
122123
const key = sessionKey(model, cwd);
123124
const thread = getOrCreateThread(model, cwd);
124125

126+
const actions: string[] = [];
127+
let usage: Usage;
125128
try {
126129
const turn = await thread.run(prompt);
130+
assert(turn.usage, "The agent did not emit the usage information.");
131+
usage = turn.usage;
132+
actions.push(...turn.items.map((item) => JSON.stringify(item)));
127133
logTurnItems(turn.items, options);
128134
} catch (error) {
129135
threadCache.delete(key);
130136
throw error;
131137
}
132138

133-
return { command: displayCommand };
139+
return {
140+
command: displayCommand,
141+
actions,
142+
usage: {
143+
input: usage.input_tokens,
144+
output: usage.output_tokens,
145+
},
146+
};
134147
},
135148
};
136149

agents/opencode.ts

Lines changed: 85 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -22,23 +22,58 @@ const DEFAULT_PERMISSION_CONFIG: NonNullable<OpencodeConfig["permission"]> = {
2222
webfetch: "allow",
2323
};
2424

25+
// Custom fetch with 25-minute timeout
26+
const customFetch = async (request: Request): Promise<Response> => {
27+
const startTime = Date.now();
28+
29+
try {
30+
const response = await fetch(request, {
31+
signal: AbortSignal.timeout(1_500_000),
32+
});
33+
const duration = Date.now() - startTime;
34+
console.error(`[opencode] Request completed - Duration: ${duration}ms`);
35+
return response;
36+
} catch (error) {
37+
const duration = Date.now() - startTime;
38+
console.error(`[opencode] Request failed - Duration: ${duration}ms`);
39+
throw error;
40+
}
41+
};
42+
2543
const opencodePort = await detectPort(4096);
2644

45+
// Set OpenCode config before server starts to ensure timeout is applied
46+
const opencodeConfig = {
47+
permission: DEFAULT_PERMISSION_CONFIG,
48+
provider: {
49+
opencode: {
50+
options: {
51+
timeout: false as false, // Disable timeout for OpenCode provider requests
52+
},
53+
},
54+
},
55+
} satisfies OpencodeConfig;
56+
57+
// CRITICAL: Set via environment variable BEFORE importing/creating anything
58+
// The SDK reads this when spawning the server process
59+
const configJson = JSON.stringify(opencodeConfig);
60+
process.env.OPENCODE_CONFIG_CONTENT = configJson;
61+
62+
console.error(`[opencode] Setting config: ${configJson}`);
63+
2764
const opencode = await createOpencode({
2865
port: opencodePort,
29-
config: {
30-
permission: DEFAULT_PERMISSION_CONFIG,
31-
},
66+
timeout: 1_500_000, // 25 minutes timeout for server startup
67+
config: opencodeConfig,
3268
});
33-
process.once("beforeExit", () => opencode.server.close());
3469

3570
const sessionCache = new Map<string, string>();
3671

3772
export const models: string[] = [
3873
// "opencode/gpt-5",
3974
"opencode/gpt-5-codex",
40-
"opencode/claude-sonnet-4-5",
41-
"opencode/big-pickle",
75+
// "opencode/claude-sonnet-4-5",
76+
// "opencode/big-pickle",
4277
// "opencode/claude-sonnet-4",
4378
// "opencode/claude-3-5-haiku",
4479
// "opencode/claude-opus-4-1",
@@ -73,33 +108,36 @@ function writeLog(
73108

74109
function logJson(value: unknown, options: AgentRunOptions | undefined): void {
75110
try {
76-
writeLog(process.stdout, JSON.stringify(value), options?.logPrefix);
111+
const message = JSON.stringify(value);
112+
writeLog(process.stdout, message, options?.logPrefix);
77113
} catch (error) {
78114
const reason = error instanceof Error ? error.message : String(error);
79-
writeLog(
80-
process.stdout,
81-
JSON.stringify({ error: "serialization_failed", reason }),
82-
options?.logPrefix,
83-
);
115+
const errorMessage = JSON.stringify({
116+
error: "serialization_failed",
117+
reason,
118+
});
119+
writeLog(process.stdout, errorMessage, options?.logPrefix);
84120
}
85121
}
86122

87123
function logError(value: unknown, options: AgentRunOptions | undefined): void {
88124
try {
89-
writeLog(process.stderr, JSON.stringify(value), options?.logPrefix);
125+
const message = JSON.stringify(value);
126+
writeLog(process.stderr, message, options?.logPrefix);
90127
} catch (error) {
91128
const reason = error instanceof Error ? error.message : String(error);
92-
writeLog(
93-
process.stderr,
94-
JSON.stringify({ error: "serialization_failed", reason }),
95-
options?.logPrefix,
96-
);
129+
const errorMessage = JSON.stringify({
130+
error: "serialization_failed",
131+
reason,
132+
});
133+
writeLog(process.stderr, errorMessage, options?.logPrefix);
97134
}
98135
}
99136

100137
function logPromptResult(
101138
result: { info: AssistantMessage; parts: Part[] },
102139
options: AgentRunOptions | undefined,
140+
logs?: string[],
103141
): void {
104142
logJson({ info: result.info }, options);
105143
if (Array.isArray(result.parts)) {
@@ -167,8 +205,14 @@ const opencodeAgent: AgentDefinition = {
167205
sessionCache.set(cacheKey, sessionID);
168206
}
169207

208+
const actions: string[] = [];
209+
const usage = {
210+
input: 0,
211+
output: 0,
212+
};
170213
try {
171214
const [providerID, modelID] = model.split("/");
215+
172216
const { data } = await opencode.client.session.prompt({
173217
path: { id: sessionID! },
174218
query: { directory: cwd },
@@ -180,10 +224,29 @@ const opencodeAgent: AgentDefinition = {
180224
parts: [{ type: "text", text: prompt }],
181225
},
182226
throwOnError: true,
227+
fetch: customFetch,
183228
});
184229

230+
if (data.info?.tokens) {
231+
usage.input = data.info.tokens.input || 0;
232+
usage.output = data.info.tokens.output || 0;
233+
} else {
234+
console.error(
235+
`[opencode] WARNING: No token usage in response. Available fields: ${Object.keys(data.info || {}).join(", ")}`,
236+
);
237+
}
238+
239+
actions.push(JSON.stringify(data.info));
240+
if (Array.isArray(data.parts)) {
241+
data.parts.forEach((part) => actions.push(JSON.stringify(part)));
242+
}
243+
185244
logPromptResult(data, options);
186245
} catch (error) {
246+
console.error(
247+
`[opencode] Error in ${model}:`,
248+
error instanceof Error ? error.message : String(error),
249+
);
187250
sessionCache.delete(cacheKey);
188251
logError(
189252
{
@@ -195,7 +258,10 @@ const opencodeAgent: AgentDefinition = {
195258
throw error;
196259
}
197260

198-
return { command: displayCommand };
261+
return { command: displayCommand, actions, usage };
262+
},
263+
cleanup() {
264+
opencode.server.close();
199265
},
200266
};
201267

bun.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)