Skip to content

Commit a3b8cb9

Browse files
cursoragent and frank committed
Refactor: Use env vars for agent/model selection
Remove hardcoded agent/model lists and use environment variables for flexibility.

Co-authored-by: frank <frank@anomalyinnovations.com>
1 parent bec3f65 commit a3b8cb9

File tree

8 files changed: +904 additions, -79 deletions

package-lock.json

Lines changed: 851 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/generate-benchmark-matrix.ts

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,32 @@
11
#!/usr/bin/env bun
2-
import { Agent } from "~/agents/index.js";
32
import { Task } from "~/src/tasks/index.js";
43

5-
const agents = Agent.list();
4+
// Models are passed via BENCHMARK_MODELS environment variable (comma-separated agent:model pairs)
5+
// Example: BENCHMARK_MODELS="opencode:opencode/gpt-5-codex,opencode:opencode/claude-sonnet-4-5"
6+
const modelsEnv = process.env.BENCHMARK_MODELS;
7+
if (!modelsEnv) {
8+
console.error(
9+
"Error: BENCHMARK_MODELS environment variable is required (comma-separated agent:model pairs)",
10+
);
11+
process.exit(1);
12+
}
13+
14+
const agentModelPairs = modelsEnv.split(",").map((pair) => {
15+
const [agent, model] = pair.split(":");
16+
if (!agent || !model) {
17+
console.error(`Invalid agent:model pair: ${pair}`);
18+
process.exit(1);
19+
}
20+
return { agent, model };
21+
});
22+
623
const tasks = await Task.listNames();
724
const include = tasks.flatMap((task) =>
8-
agents.flatMap((agent) =>
9-
agent.models.map((model) => ({
10-
eval: task,
11-
model,
12-
agent: agent.name,
13-
})),
14-
),
25+
agentModelPairs.map(({ agent, model }) => ({
26+
eval: task,
27+
model,
28+
agent,
29+
})),
1530
);
1631

1732
const matrix = JSON.stringify({ include });

scripts/sync-workflow-inputs.ts

Lines changed: 20 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,12 @@
11
#!/usr/bin/env bun
22
/**
3-
* Syncs workflow_dispatch inputs in compare-models.yml with available agent:model combinations.
4-
* Run this after modifying agent model lists to keep the workflow in sync.
3+
* Syncs workflow_dispatch inputs in compare-models.yml with provided agent:model combinations.
54
*
65
* Usage:
7-
* bun run scripts/sync-workflow-inputs.ts
6+
* WORKFLOW_MODELS="opencode:opencode/gpt-5-codex,opencode:opencode/claude-sonnet-4-5" bun run scripts/sync-workflow-inputs.ts
87
*/
98

109
import { readFileSync, writeFileSync } from "node:fs";
11-
import { Agent } from "~/agents/index.js";
1210
import YAML from "yaml";
1311

1412
interface WorkflowInput {
@@ -34,24 +32,27 @@ function toDescription(agent: string, model: string): string {
3432
async function main(): Promise<void> {
3533
const workflowPath = ".github/workflows/compare-models.yml";
3634

37-
// Load the workflow file
38-
const workflowContent = readFileSync(workflowPath, "utf8");
39-
const workflow = YAML.parse(workflowContent);
40-
41-
// Get all available agent:model combinations
42-
const agents = Agent.list();
43-
const combinations: Array<{ agent: string; model: string }> = [];
35+
// Models are passed via WORKFLOW_MODELS environment variable (comma-separated agent:model pairs)
36+
const modelsEnv = process.env.WORKFLOW_MODELS;
37+
if (!modelsEnv) {
38+
console.error(
39+
"Error: WORKFLOW_MODELS environment variable is required (comma-separated agent:model pairs)",
40+
);
41+
process.exit(1);
42+
}
4443

45-
for (const agent of agents) {
46-
for (const model of agent.models) {
47-
combinations.push({ agent: agent.name, model });
44+
const combinations = modelsEnv.split(",").map((pair) => {
45+
const [agent, model] = pair.split(":");
46+
if (!agent || !model) {
47+
console.error(`Invalid agent:model pair: ${pair}`);
48+
process.exit(1);
4849
}
49-
}
50+
return { agent, model };
51+
});
5052

51-
if (combinations.length === 0) {
52-
console.error("No agent:model combinations found");
53-
process.exit(1);
54-
}
53+
// Load the workflow file
54+
const workflowContent = readFileSync(workflowPath, "utf8");
55+
const workflow = YAML.parse(workflowContent);
5556

5657
// Build new inputs
5758
const inputs: WorkflowInputs = {};

src/agents/claude-code.ts

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,6 @@ import { Logger } from "../util/logger.js";
88

99
const sessionCache = new Map<string, string>();
1010

11-
export const models: string[] = [
12-
"claude-sonnet-4-5",
13-
"claude-opus-4-5",
14-
// "claude-sonnet-4",
15-
// "claude-opus-4-1",
16-
// "claude-3-5-haiku",
17-
];
18-
1911
function sessionKey(model: string, cwd: string): string {
2012
return `${cwd}::${model}`;
2113
}

src/agents/codex.ts

Lines changed: 1 addition & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -16,14 +16,6 @@ const DEFAULT_SANDBOX: SandboxMode = "workspace-write";
1616
const codexClient = new Codex();
1717
const threadCache = new Map<string, Thread>();
1818

19-
export const models = [
20-
"gpt-5-codex",
21-
"gpt-5.1-codex",
22-
// "gpt-5",
23-
// "o3",
24-
// "o4-mini"
25-
] as const;
26-
2719
function sessionKey(model: string, cwd: string): string {
2820
return `${cwd}::${model}`;
2921
}
@@ -67,7 +59,7 @@ function getOrCreateThread(model: string, cwd: string): Thread {
6759
return thread;
6860
}
6961

70-
const codexAgent: Agent.Definition<(typeof models)[number]> = {
62+
const codexAgent: Agent.Definition = {
7163
async run(model, prompt, options) {
7264
options.logger.log(
7365
`codex-sdk --model ${model} --sandbox ${DEFAULT_SANDBOX} ${prompt}`,

src/agents/index.ts

Lines changed: 8 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
1-
import { strict as assert } from "node:assert";
21
import * as opencodeAgent from "./opencode.js";
32
import * as codexAgent from "./codex.js";
43
import * as claudeCodeAgent from "./claude-code.js";
54
import { Logger } from "../util/logger.js";
5+
import { strict as assert } from "node:assert";
66

77
export namespace Agent {
88
export type Prompt = string;
@@ -43,33 +43,29 @@ export namespace Agent {
4343
logger: Logger.Instance;
4444
}
4545

46-
export interface Registration<TModel extends string = string> {
46+
export interface Registration {
4747
name: string;
48-
definition: Definition<TModel>;
49-
models: ReadonlyArray<TModel>;
48+
definition: Definition;
5049
}
5150

52-
const agents: Record<string, Registration<any>> = {
51+
const agents: Record<string, Registration> = {
5352
// Only keep opencode active while debugging timeouts for specific models.
5453
opencode: createRegistration("opencode", opencodeAgent),
5554
//codex: createRegistration("codex", codexAgent),
5655
//"claude-code": createRegistration("claude-code", claudeCodeAgent),
5756
};
5857

59-
function createRegistration<TModel extends string>(
58+
function createRegistration(
6059
name: string,
6160
module: {
62-
default?: Definition<TModel>;
63-
models?: ReadonlyArray<TModel>;
61+
default?: Definition;
6462
},
65-
): Registration<TModel> {
63+
): Registration {
6664
const definition = module.default;
67-
const models = module.models;
6865

6966
assert(definition, `Agent module ${name} is missing a default export.`);
70-
assert(models, `Agent module ${name} is missing the exported models list.`);
7167

72-
return { name, definition, models };
68+
return { name, definition };
7369
}
7470

7571
export function get(name: string): Registration {
@@ -78,13 +74,6 @@ export namespace Agent {
7874
return agent;
7975
}
8076

81-
export function validateModel(agent: Registration, model: string) {
82-
if (!agent.models.find((entry) => entry === model))
83-
throw new Error(
84-
`Model ${model} is not registered for agent ${agent.name}.`,
85-
);
86-
}
87-
8877
export function list() {
8978
return Object.values(agents);
9079
}

src/agents/opencode.ts

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -36,20 +36,6 @@ const opencode = await createOpencode({
3636

3737
const sessionCache = new Map<string, string>();
3838

39-
export const models: string[] = [
40-
"opencode/gpt-5-codex",
41-
"opencode/gpt-5.1-codex",
42-
"opencode/claude-sonnet-4-5",
43-
"opencode/claude-opus-4-5",
44-
"opencode/glm-4.6",
45-
"opencode/glm-4.7-free",
46-
"opencode/gemini-3-pro",
47-
"opencode/qwen3-coder",
48-
"opencode/kimi-k2",
49-
"opencode/grok-code",
50-
"opencode/alpha-gd4",
51-
];
52-
5339
function sessionKey(model: string, cwd: string): string {
5440
return `${cwd}::${model}`;
5541
}

src/eval.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,6 @@ export namespace Eval {
4747
},
4848
) {
4949
const agent = Agent.get(agentName);
50-
Agent.validateModel(agent, modelId);
5150
const task = await Task.get(taskId);
5251
const cwd = await mkdtemp(join(tmpdir(), "openreval-"));
5352
$.cwd(cwd);

0 commit comments

Comments
 (0)