Skip to content

Commit 3f46bf3

Browse files
cursoragent and frank committed
Refactor: Centralize benchmark model configuration
Move benchmark model definitions to a new config file and update scripts to use it.

Co-authored-by: frank <frank@anomalyinnovations.com>
1 parent bec3f65 commit 3f46bf3

File tree

8 files changed

+42
-57
lines changed

8 files changed

+42
-57
lines changed

scripts/benchmark-config.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
export const BENCHMARK_MODELS: Record<string, string[]> = {
2+
"claude-code": [
3+
"claude-sonnet-4-5",
4+
"claude-opus-4-5",
5+
],
6+
codex: [
7+
"gpt-5-codex",
8+
"gpt-5.1-codex",
9+
],
10+
opencode: [
11+
"opencode/gpt-5-codex",
12+
"opencode/gpt-5.1-codex",
13+
"opencode/claude-sonnet-4-5",
14+
"opencode/claude-opus-4-5",
15+
"opencode/glm-4.6",
16+
"opencode/glm-4.7-free",
17+
"opencode/gemini-3-pro",
18+
"opencode/qwen3-coder",
19+
"opencode/kimi-k2",
20+
"opencode/grok-code",
21+
"opencode/alpha-gd4",
22+
],
23+
};

scripts/generate-benchmark-matrix.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
#!/usr/bin/env bun
22
import { Agent } from "~/agents/index.js";
33
import { Task } from "~/src/tasks/index.js";
4+
import { BENCHMARK_MODELS } from "./benchmark-config.js";
45

56
const agents = Agent.list();
67
const tasks = await Task.listNames();
78
const include = tasks.flatMap((task) =>
8-
agents.flatMap((agent) =>
9-
agent.models.map((model) => ({
9+
agents.flatMap((agent) => {
10+
const models = BENCHMARK_MODELS[agent.name] || [];
11+
return models.map((model) => ({
1012
eval: task,
1113
model,
1214
agent: agent.name,
13-
})),
14-
),
15+
}));
16+
}),
1517
);
1618

1719
const matrix = JSON.stringify({ include });

scripts/sync-workflow-inputs.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import { readFileSync, writeFileSync } from "node:fs";
1111
import { Agent } from "~/agents/index.js";
12+
import { BENCHMARK_MODELS } from "./benchmark-config.js";
1213
import YAML from "yaml";
1314

1415
interface WorkflowInput {
@@ -43,7 +44,8 @@ async function main(): Promise<void> {
4344
const combinations: Array<{ agent: string; model: string }> = [];
4445

4546
for (const agent of agents) {
46-
for (const model of agent.models) {
47+
const models = BENCHMARK_MODELS[agent.name] || [];
48+
for (const model of models) {
4749
combinations.push({ agent: agent.name, model });
4850
}
4951
}

src/agents/claude-code.ts

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,6 @@ import { Logger } from "../util/logger.js";
88

99
const sessionCache = new Map<string, string>();
1010

11-
export const models: string[] = [
12-
"claude-sonnet-4-5",
13-
"claude-opus-4-5",
14-
// "claude-sonnet-4",
15-
// "claude-opus-4-1",
16-
// "claude-3-5-haiku",
17-
];
18-
1911
function sessionKey(model: string, cwd: string): string {
2012
return `${cwd}::${model}`;
2113
}

src/agents/codex.ts

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,6 @@ const DEFAULT_SANDBOX: SandboxMode = "workspace-write";
1616
const codexClient = new Codex();
1717
const threadCache = new Map<string, Thread>();
1818

19-
export const models = [
20-
"gpt-5-codex",
21-
"gpt-5.1-codex",
22-
// "gpt-5",
23-
// "o3",
24-
// "o4-mini"
25-
] as const;
26-
2719
function sessionKey(model: string, cwd: string): string {
2820
return `${cwd}::${model}`;
2921
}
@@ -67,7 +59,7 @@ function getOrCreateThread(model: string, cwd: string): Thread {
6759
return thread;
6860
}
6961

70-
const codexAgent: Agent.Definition<(typeof models)[number]> = {
62+
const codexAgent: Agent.Definition = {
7163
async run(model, prompt, options) {
7264
options.logger.log(
7365
`codex-sdk --model ${model} --sandbox ${DEFAULT_SANDBOX} ${prompt}`,

src/agents/index.ts

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ export namespace Agent {
2020
prompt: Prompt,
2121
) => CommandSpec | Promise<CommandSpec>;
2222

23-
export interface Definition<TModel extends string = string> {
23+
export interface Definition {
2424
run: (
25-
model: TModel,
25+
model: string,
2626
prompt: Prompt,
2727
options: RunOptions,
2828
) => Promise<RunResult>;
@@ -43,33 +43,29 @@ export namespace Agent {
4343
logger: Logger.Instance;
4444
}
4545

46-
export interface Registration<TModel extends string = string> {
46+
export interface Registration {
4747
name: string;
48-
definition: Definition<TModel>;
49-
models: ReadonlyArray<TModel>;
48+
definition: Definition;
5049
}
5150

52-
const agents: Record<string, Registration<any>> = {
51+
const agents: Record<string, Registration> = {
5352
// Only keep opencode active while debugging timeouts for specific models.
5453
opencode: createRegistration("opencode", opencodeAgent),
5554
//codex: createRegistration("codex", codexAgent),
5655
//"claude-code": createRegistration("claude-code", claudeCodeAgent),
5756
};
5857

59-
function createRegistration<TModel extends string>(
58+
function createRegistration(
6059
name: string,
6160
module: {
62-
default?: Definition<TModel>;
63-
models?: ReadonlyArray<TModel>;
61+
default?: Definition;
6462
},
65-
): Registration<TModel> {
63+
): Registration {
6664
const definition = module.default;
67-
const models = module.models;
6865

6966
assert(definition, `Agent module ${name} is missing a default export.`);
70-
assert(models, `Agent module ${name} is missing the exported models list.`);
7167

72-
return { name, definition, models };
68+
return { name, definition };
7369
}
7470

7571
export function get(name: string): Registration {
@@ -78,13 +74,6 @@ export namespace Agent {
7874
return agent;
7975
}
8076

81-
export function validateModel(agent: Registration, model: string) {
82-
if (!agent.models.find((entry) => entry === model))
83-
throw new Error(
84-
`Model ${model} is not registered for agent ${agent.name}.`,
85-
);
86-
}
87-
8877
export function list() {
8978
return Object.values(agents);
9079
}

src/agents/opencode.ts

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,6 @@ const opencode = await createOpencode({
3636

3737
const sessionCache = new Map<string, string>();
3838

39-
export const models: string[] = [
40-
"opencode/gpt-5-codex",
41-
"opencode/gpt-5.1-codex",
42-
"opencode/claude-sonnet-4-5",
43-
"opencode/claude-opus-4-5",
44-
"opencode/glm-4.6",
45-
"opencode/glm-4.7-free",
46-
"opencode/gemini-3-pro",
47-
"opencode/qwen3-coder",
48-
"opencode/kimi-k2",
49-
"opencode/grok-code",
50-
"opencode/alpha-gd4",
51-
];
52-
5339
function sessionKey(model: string, cwd: string): string {
5440
return `${cwd}::${model}`;
5541
}

src/eval.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ export namespace Eval {
4747
},
4848
) {
4949
const agent = Agent.get(agentName);
50-
Agent.validateModel(agent, modelId);
5150
const task = await Task.get(taskId);
5251
const cwd = await mkdtemp(join(tmpdir(), "openreval-"));
5352
$.cwd(cwd);

0 commit comments

Comments
 (0)