Skip to content

Commit 8c89007

Browse files
committed
wip
1 parent: 605f704 · commit: 8c89007

File tree

6 files changed: 27 additions (+27) and 59 deletions (-59).

agents/codex.ts

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,10 +70,14 @@ function logTurnItems(
7070
try {
7171
writeLog(process.stdout, JSON.stringify(item), options?.logPrefix);
7272
} catch (error) {
73-
const fallback = isCommandExecutionItem(item)
73+
const sanitizedItem = isCommandExecutionItem(item)
7474
? { ...item, aggregated_output: "<omitted>" }
7575
: item;
76-
writeLog(process.stdout, JSON.stringify(fallback), options?.logPrefix);
76+
writeLog(
77+
process.stdout,
78+
JSON.stringify(sanitizedItem),
79+
options?.logPrefix,
80+
);
7781
if (error instanceof Error) {
7882
writeLog(
7983
process.stderr,

agents/opencode.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -71,9 +71,9 @@ const sessionCache = new Map<string, string>();
7171

7272
export const models: string[] = [
7373
// "opencode/gpt-5",
74-
"opencode/gpt-5-codex",
75-
// "opencode/claude-sonnet-4-5",
76-
// "opencode/big-pickle",
74+
// "opencode/gpt-5-codex",
75+
"opencode/claude-sonnet-4-5",
76+
"opencode/big-pickle",
7777
// "opencode/claude-sonnet-4",
7878
// "opencode/claude-3-5-haiku",
7979
// "opencode/claude-opus-4-1",

judges.ts

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,14 @@
11
import type { Judge } from "~/lib/judgeTypes.js";
22
import { getZenLanguageModel } from "~/lib/zenModels.js";
33

4-
const fallback = (envName: string, defaultValue: string): string =>
5-
process.env[envName]?.trim() || defaultValue;
6-
74
function resolveJudgeModelId(judgeName: Judge["name"]): string {
85
switch (judgeName) {
96
case "claude-4.5":
10-
return fallback("CLAUDE_MODEL", "opencode/claude-sonnet-4-5");
7+
return "opencode/claude-sonnet-4-5";
118
case "gpt-5-codex":
12-
return fallback("GPT5_CODEX_MODEL", "opencode/gpt-5-codex");
9+
return "opencode/gpt-5-codex";
1310
case "kimi":
14-
return fallback("KIMI_MODEL", "opencode/kimi-k2");
11+
return "opencode/kimi-k2";
1512
default:
1613
return judgeName;
1714
}

lib/planner.ts

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,9 +5,6 @@ import type { DatasetEval } from "~/lib/dataset.js";
55
import { plannerExamples } from "~/lib/plannerExamples.js";
66
import { getZenLanguageModel } from "~/lib/zenModels.js";
77

8-
const fallback = (envName: string, defaultValue: string): string =>
9-
process.env[envName]?.trim() || defaultValue;
10-
118
export interface PlannerCommitDiff {
129
sha: string;
1310
title: string;
@@ -61,7 +58,7 @@ What NOT to include:
6158
6259
Always respond strictly as JSON conforming to the schema. Do not add commentary.`;
6360

64-
const plannerModelId = fallback("PLANNER_MODEL", "opencode/claude-sonnet-4-5");
61+
const plannerModelId = "opencode/claude-sonnet-4-5";
6562

6663
function buildSystemPrompt(): string {
6764
if (plannerExamples.length === 0) {

lib/summarizer.ts

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,6 @@ import { z } from "zod";
44
import type { DatasetEval } from "~/lib/dataset.js";
55
import { getZenLanguageModel } from "~/lib/zenModels.js";
66

7-
const fallback = (envName: string, defaultValue: string): string =>
8-
process.env[envName]?.trim() || defaultValue;
9-
107
export interface EpisodeActions {
118
episodeIndex: number;
129
actions: string[];
@@ -43,10 +40,7 @@ Guidelines:
4340
- Note any errors or issues encountered
4441
- Be objective and descriptive, not evaluative`;
4542

46-
const summarizerModelId = fallback(
47-
"SUMMARIZER_MODEL",
48-
"opencode/claude-sonnet-4-5",
49-
);
43+
const summarizerModelId = "opencode/claude-sonnet-4-5";
5044

5145
export async function generateActionsSummary(
5246
evaluation: DatasetEval,

scripts/analysis.ts

Lines changed: 13 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -33,13 +33,7 @@ Focus on concrete observations from the data provided. Look for patterns such as
3333
3434
Provide a concise, insightful analysis that helps developers understand agent behavior and improve the evaluation system.`;
3535

36-
const fallback = (envName: string, defaultValue: string): string =>
37-
process.env[envName]?.trim() || defaultValue;
38-
39-
const analyzerModelId = fallback(
40-
"ANALYZER_MODEL",
41-
"opencode/claude-sonnet-4-5",
42-
);
36+
const analyzerModelId = "opencode/claude-sonnet-4-5";
4337

4438
function buildDynamicContext(runs: EvaluationRunExport[]): string {
4539
const repo = runs[0].evaluation.repo;
@@ -99,25 +93,16 @@ function formatFallbackSummary(runs: EvaluationRunExport[]): string {
9993
return lines.join("\n").trimEnd();
10094
}
10195

102-
async function generateAnalysis(
103-
runs: EvaluationRunExport[],
104-
): Promise<string> {
96+
async function generateAnalysis(runs: EvaluationRunExport[]): Promise<string> {
10597
const context = buildDynamicContext(runs);
10698

107-
try {
108-
const { text } = await generateText({
109-
model: getZenLanguageModel(analyzerModelId),
110-
system: AGENT_ANALYSIS_PROMPT,
111-
prompt: context,
112-
temperature: 0.3,
113-
});
114-
return text.trim();
115-
} catch (error) {
116-
const message = error instanceof Error ? error.message : String(error);
117-
return `Failed to generate AI analysis (${message}).\n\n${formatFallbackSummary(
118-
runs,
119-
)}`;
120-
}
99+
const { text } = await generateText({
100+
model: getZenLanguageModel(analyzerModelId),
101+
system: AGENT_ANALYSIS_PROMPT,
102+
prompt: context,
103+
temperature: 0.3,
104+
});
105+
return text.trim();
121106
}
122107

123108
function usage(): void {
@@ -137,33 +122,24 @@ async function main(): Promise<void> {
137122
}
138123

139124
const filePath = args[0];
140-
let parsed: unknown;
125+
let runs: EvaluationRunExport[];
141126

142127
try {
143128
const fileContent = readFileSync(filePath, "utf-8");
144-
parsed = JSON.parse(fileContent) as unknown;
129+
runs = JSON.parse(fileContent) as EvaluationRunExport[];
145130
} catch (error) {
146131
console.error(`Error reading file ${filePath}:`, error);
147132
process.exit(1);
148133
}
149134

150-
const runs = (Array.isArray(parsed)
151-
? (parsed as EvaluationRunExport[])
152-
: [parsed as EvaluationRunExport]).filter(
153-
(run): run is EvaluationRunExport =>
154-
run != null && typeof run === "object" && "finalScore" in run,
155-
);
156-
157135
if (runs.length === 0) {
158136
console.error("No evaluation runs found in the provided file.");
159137
process.exit(1);
160138
}
161139

162-
const orderedRuns = [...runs].sort(
163-
(a, b) => b.finalScore - a.finalScore,
164-
);
140+
runs.sort((a, b) => b.finalScore - a.finalScore);
165141

166-
const output = await generateAnalysis(orderedRuns);
142+
const output = await generateAnalysis(runs);
167143
process.stdout.write(`${output.trimEnd()}\n`);
168144
}
169145

0 commit comments

Comments
 (0)