Skip to content

Commit f24df89

Browse files
author
Frank
committed
sync
1 parent f375c70 commit f24df89

39 files changed

+540
-699
lines changed

cli.ts

Lines changed: 76 additions & 102 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,11 @@ import yargs from "yargs";
88
import { hideBin } from "yargs/helpers";
99
import { Agent } from "~/agents/index.js";
1010
import { scores as scoreRegistry } from "~/scores/index.js";
11-
import { dataset } from "~/lib/dataset.js";
12-
import type { DatasetEval, ScoreAssignment } from "~/lib/dataset.js";
13-
import { generatePromptsForEval, Task } from "~/lib/prompts.js";
11+
import { Eval } from "~/evals/index.js";
1412
import {
1513
generateActionsSummary,
1614
type EpisodeActions,
1715
} from "~/lib/summarizer.js";
18-
import { loadPromptsFile } from "~/lib/prompts.js";
1916
import { judges, getJudgeModelId } from "~/judges.js";
2017
import { aggregateScores } from "~/lib/utils/scoreAggregation.js";
2118
import type { Judge } from "~/lib/judgeTypes.js";
@@ -30,10 +27,6 @@ import { Logger } from "./lib/logger.js";
3027

3128
type ModelCombination = string;
3229

33-
const evalIds = dataset
34-
.map((entry) => entry.identifier)
35-
.sort((a, b) => a.localeCompare(b));
36-
3730
const cli = yargs(hideBin(process.argv))
3831
.scriptName("orvl")
3932
.wrap(null)
@@ -55,40 +48,22 @@ const cli = yargs(hideBin(process.argv))
5548
.strict();
5649

5750
cli.command(
58-
"prompts",
59-
"Generate prompts for a specific evaluation",
60-
(yargs) =>
61-
yargs
62-
.option("eval", {
63-
type: "string",
64-
description: "eval to use in the format of repo@from..to",
65-
choices: evalIds,
66-
})
67-
.example([
68-
["orvl prompts", "Generate prompts for all evaluations"],
69-
[
70-
"orvl prompts --eval DataDog/datadog-lambda-python@93d4a07..d776378",
71-
"Generate prompts for a specific evaluation",
72-
],
73-
]),
51+
"generate",
52+
"Generate dataset for all evaluations",
53+
async (yargs) =>
54+
yargs.example([["orvl generate", "Generate dataset for all evaluations"]]),
7455
async ({ eval: evalId }) => {
75-
const evalDefs = (() => {
76-
if (!evalId) return [...dataset];
77-
const evalDef = dataset.find((entry) => entry.identifier === evalId);
78-
if (!evalDef) throw new Error(`Evaluation not found: ${evalId}`);
79-
return [evalDef];
80-
})();
81-
82-
console.log(`Generating prompts for ${evalDefs.length} evaluation(s)...\n`);
56+
const logger = Logger.create();
57+
logger.log(`Generating dataset...`);
8358

84-
await Promise.all(evalDefs.map(generatePromptsForEval));
59+
await Eval.generate({ logger });
8560
},
8661
);
8762

8863
cli.command(
8964
"$0 [agent]",
9065
"Run benchmark evaluation",
91-
(yargs) =>
66+
async (yargs) =>
9267
yargs
9368
.positional("agent", {
9469
type: "string",
@@ -104,7 +79,6 @@ cli.command(
10479
.option("eval", {
10580
type: "string",
10681
description: "eval to use in the format of repo@from..to",
107-
choices: dataset.map((entry) => entry.identifier),
10882
required: true,
10983
})
11084
.option("episodes", {
@@ -130,10 +104,10 @@ cli.command(
130104
timeout: timeoutMins,
131105
output: outputPath,
132106
}) => {
107+
const evals = await Eval.load();
133108
const agent = getAgent(agentName);
134109
const model = getModel(agent, modelFilter);
135-
const evalDef = getEval(evalId);
136-
const tasks = getTasks(evalDef);
110+
const evalDef = getEval(evals, evalId);
137111
const logger = Logger.create(`[model ${model}]`);
138112

139113
// Run episodes
@@ -143,7 +117,7 @@ cli.command(
143117
const childLogger = logger.child(`[episode ${index}/${episodes}]`);
144118
childLogger.log(`Starting episode with ${timeoutMins}min timeout...`);
145119
return withRetries(
146-
() => runEpisode(evalDef, agent, model, tasks, childLogger),
120+
() => runEpisode(evalDef, agent, model, childLogger),
147121
{
148122
retries: 3,
149123
timeoutMs: timeoutMins * 60 * 1000,
@@ -153,6 +127,9 @@ cli.command(
153127
}),
154128
);
155129

130+
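// TODO(review): raw debug dump of the settled episode results — presumably temporary; confirm, then remove or route through `logger`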
131+
console.log(JSON.stringify(settled, null, 2));
132+
156133
const results = settled
157134
.filter((result) => result.status === "fulfilled")
158135
.map((result) => result.value)
@@ -215,8 +192,8 @@ cli.command(
215192
);
216193

217194
printEvalResult(episodeExports, evaluationResult, logger);
218-
buildEvalChart(evaluationResult);
219-
storeEvalResult(evaluationResult, outputPath);
195+
buildEvalChart(evaluationResult, logger);
196+
storeEvalResult(evaluationResult, outputPath, logger);
220197
},
221198
);
222199

@@ -251,8 +228,8 @@ function getModel(agent: Agent.Registration, modelFilter: string) {
251228
return model;
252229
}
253230

254-
function getEval(evalId: string) {
255-
const evalDef = dataset.find((entry) => entry.identifier === evalId);
231+
function getEval(evals: Eval.Instance[], evalId: string) {
232+
const evalDef = evals.find((ev) => ev.id === evalId);
256233
if (!evalDef) throw new Error(`Eval ${evalId} was not found.`);
257234
if (!evalDef.scores.length)
258235
throw new Error(
@@ -261,50 +238,42 @@ function getEval(evalId: string) {
261238
return evalDef;
262239
}
263240

264-
function getTasks(evalDef: DatasetEval) {
265-
const tasks = loadPromptsFile(evalDef.prompts);
266-
if (tasks.length === 0)
267-
throw new Error(
268-
`No prompts found in ${evalDef.prompts} for ${evalDef.repo}.`,
269-
);
270-
return tasks;
271-
}
272-
273241
async function runEpisode(
274-
evalDef: DatasetEval,
242+
ev: Eval.Instance,
275243
agent: Agent.Registration,
276244
model: string,
277-
tasks: Task[],
278245
logger: Logger.Instance,
279246
) {
280247
const cwd = mkdtempSync(join(tmpdir(), "openreval-"));
281248

249+
// validate prompts
250+
if (ev.prompts.length === 0)
251+
throw new Error(`No prompts found in ${ev.prompts} for ${ev.repo}.`);
252+
282253
try {
283254
logger.log(`Cloning repository...`);
284-
cloneRepositoryAtCommit(cwd, evalDef.repo, evalDef.from);
255+
cloneRepositoryAtCommit(cwd, ev.repo, ev.from);
285256

286257
const preparedScores = new Map<string, unknown>();
287-
for (const assignment of evalDef.scores) {
288-
const scoreDefinition = scoreRegistry[assignment.name];
258+
for (const score of ev.scores) {
259+
const scoreDefinition = scoreRegistry[score.name];
289260
if (!scoreDefinition)
290261
throw new Error(
291-
logger.format(`Score ${assignment.name} is not registered.`),
262+
logger.format(`Score ${score.name} is not registered.`),
292263
);
293264

294265
try {
295266
const prepared = await scoreDefinition.prepare({
296-
evaluation: evalDef,
267+
ev: ev,
297268
cwd,
298-
config: assignment.args,
269+
config: score.args,
299270
logger,
300271
});
301-
preparedScores.set(assignment.name, prepared);
272+
preparedScores.set(score.name, prepared);
302273
} catch (error) {
303274
const message = error instanceof Error ? error.message : String(error);
304275
throw new Error(
305-
logger.format(
306-
`Failed to prepare score ${assignment.name}: ${message}`,
307-
),
276+
logger.format(`Failed to prepare score ${score.name}: ${message}`),
308277
);
309278
}
310279
}
@@ -314,14 +283,14 @@ async function runEpisode(
314283
const usage = { input: 0, output: 0, cost: 0 };
315284
const episodeActions: string[] = [];
316285

317-
for (const task of tasks) {
286+
for (const prompt of ev.prompts) {
318287
const childLogger = logger.child(
319-
`[task ${evalDef.repo.split("/")[1]}@${task.commit.slice(0, 7)}]`,
288+
`[prompt ${ev.repo.split("/")[1]}@${prompt.commit.slice(0, 7)}]`,
320289
);
321290

322291
try {
323292
const startedAt = Date.now();
324-
const result = await agent.definition.run(model, task.prompt, cwd!, {
293+
const result = await agent.definition.run(model, prompt.prompt, cwd!, {
325294
logger: childLogger,
326295
});
327296
duration += Date.now() - startedAt;
@@ -337,7 +306,7 @@ async function runEpisode(
337306
const message = error instanceof Error ? error.message : String(error);
338307
throw new Error(
339308
childLogger.format(
340-
`Agent run failed for planner task ${task.commit}: ${message}`,
309+
`Agent run failed for planner task ${prompt.commit}: ${message}`,
341310
),
342311
);
343312
}
@@ -348,7 +317,7 @@ async function runEpisode(
348317
// compare the untouched baseline against the desired target.
349318

350319
const episodeAggregation = await collectAggregationInputsForRun(
351-
evalDef,
320+
ev,
352321
model,
353322
cwd,
354323
preparedScores,
@@ -400,24 +369,33 @@ function printEvalResult(
400369

401370
if (!evalExport.exportData) return;
402371

403-
const { finalScore, baseScore, variancePenalty } = evalExport.exportData;
372+
const formatEpisode = (final: number, base: number, penalty: number) =>
373+
`final ${final.toFixed(3)} (base ${base.toFixed(
374+
3,
375+
)} - penalty ${penalty.toFixed(3)})`;
376+
404377
logger.log(
405378
"Episode recap:",
406-
episodes.map(
407-
(episode, index) =>
408-
` Episode ${index + 1}: final ${episode.finalScore.toFixed(
409-
3,
410-
)} (base ${episode.baseScore.toFixed(
411-
3,
412-
)} - penalty ${episode.variancePenalty.toFixed(3)})`,
379+
...episodes.map(
380+
(episode, i) =>
381+
` Episode ${i + 1}: ${formatEpisode(
382+
episode.finalScore,
383+
episode.baseScore,
384+
episode.variancePenalty,
385+
)}`,
413386
),
414-
`Aggregate final: ${finalScore.toFixed(3)} (base ${baseScore.toFixed(
415-
3,
416-
)} - penalty ${variancePenalty.toFixed(3)})`,
387+
`Aggregate final: ${formatEpisode(
388+
evalExport.exportData.finalScore,
389+
evalExport.exportData.baseScore,
390+
evalExport.exportData.variancePenalty,
391+
)}`,
417392
);
418393
}
419394

420-
function buildEvalChart(evalExport: ReturnType<typeof summarizeAggregation>) {
395+
function buildEvalChart(
396+
evalExport: ReturnType<typeof summarizeAggregation>,
397+
logger: Logger.Instance,
398+
) {
421399
const chartUrl = buildRadarChartUrl({
422400
labels: evalExport.exportData.scores.map((s) => s.assignment.name),
423401
values: evalExport.exportData.scores.map((s) =>
@@ -426,12 +404,13 @@ function buildEvalChart(evalExport: ReturnType<typeof summarizeAggregation>) {
426404
title: `${evalExport.exportData.evaluation.repo}${evalExport.exportData.model}`,
427405
datasetLabel: evalExport.exportData.model,
428406
});
429-
console.log(`\nRadar Chart: ${chartUrl}\n`);
407+
logger.log(`Radar Chart: ${chartUrl}\n`);
430408
}
431409

432410
function storeEvalResult(
433411
evalExport: ReturnType<typeof summarizeAggregation>,
434-
outputPath?: string,
412+
outputPath: string | undefined,
413+
logger: Logger.Instance,
435414
) {
436415
if (!outputPath) return;
437416

@@ -443,9 +422,9 @@ function storeEvalResult(
443422

444423
writeFileSync(outputPath, JSON.stringify(evalExport.exportData, null, 2));
445424
} catch (error) {
446-
const message =
447-
error instanceof Error ? error.message : "Unknown error writing output.";
448-
throw new Error(`Failed to write export to ${outputPath}: ${message}`);
425+
throw new Error(
426+
logger.format(`Failed to write export to ${outputPath}:`, error),
427+
);
449428
}
450429
}
451430

@@ -470,7 +449,7 @@ function cleanupRepository(cwd: string, logger: Logger.Instance): void {
470449
}
471450

472451
async function collectAggregationInputsForRun(
473-
datasetEval: DatasetEval,
452+
ev: Eval.Instance,
474453
model: ModelCombination,
475454
cwd: string,
476455
preparedReferences: Map<string, unknown>,
@@ -479,7 +458,7 @@ async function collectAggregationInputsForRun(
479458
const aggregationInputs = new Map<string, ScoreAggregationInput>();
480459

481460
for (const judge of judges) {
482-
for (const assignment of datasetEval.scores) {
461+
for (const assignment of ev.scores) {
483462
const scoreDefinition = scoreRegistry[assignment.name];
484463

485464
if (!scoreDefinition) {
@@ -502,7 +481,7 @@ async function collectAggregationInputsForRun(
502481
const result = await scoreDefinition.evaluate({
503482
judge,
504483
reference,
505-
evaluation: datasetEval,
484+
ev: ev,
506485
cwd,
507486
config: assignment.args,
508487
logger,
@@ -537,15 +516,15 @@ async function collectAggregationInputsForRun(
537516

538517
function summarizeAggregation(
539518
agentName: string,
540-
datasetEval: DatasetEval,
519+
ev: Eval.Instance,
541520
model: ModelCombination,
542521
aggregationInputs: Map<string, ScoreAggregationInput>,
543522
episodes: Episode[],
544523
usage: Usage,
545524
summary: string,
546525
duration: number,
547-
): { lines: string[]; exportData: EvaluationRunExport } {
548-
const evalId = datasetEval.repo;
526+
) {
527+
const evalId = ev.repo;
549528

550529
const aggregation = aggregateScores(Array.from(aggregationInputs.values()));
551530

@@ -591,10 +570,10 @@ function summarizeAggregation(
591570
const exportData: EvaluationRunExport = {
592571
agent: agentName,
593572
evaluation: {
594-
identifier: datasetEval.identifier,
595-
repo: datasetEval.repo,
596-
from: datasetEval.from,
597-
to: datasetEval.to,
573+
identifier: ev.id,
574+
repo: ev.repo,
575+
from: ev.from,
576+
to: ev.to,
598577
},
599578
model,
600579
jobUrl: process.env.GITHUB_BENCHMARK_JOB_URL!,
@@ -649,12 +628,7 @@ function buildScoreExportsFromEpisodes(
649628

650629
episodes.forEach((episode) => {
651630
episode.scores.forEach((score) => {
652-
const assignment: ScoreAssignment = {
653-
name: score.assignment.name,
654-
weight: score.assignment.weight,
655-
args: score.assignment.args,
656-
};
657-
const entry = ensureAggregationEntry(aggregationInputs, assignment);
631+
const entry = ensureAggregationEntry(aggregationInputs, score.assignment);
658632

659633
score.judges.forEach((judgeResult) => {
660634
const judge: Judge = {
@@ -677,7 +651,7 @@ function buildScoreExportsFromEpisodes(
677651

678652
function ensureAggregationEntry(
679653
map: Map<string, ScoreAggregationInput>,
680-
assignment: ScoreAssignment,
654+
assignment: Eval.Instance["scores"][number],
681655
): ScoreAggregationInput {
682656
if (!map.has(assignment.name)) {
683657
map.set(assignment.name, {

0 commit comments

Comments
 (0)