@@ -8,14 +8,11 @@ import yargs from "yargs";
88import { hideBin } from "yargs/helpers" ;
99import { Agent } from "~/agents/index.js" ;
1010import { scores as scoreRegistry } from "~/scores/index.js" ;
11- import { dataset } from "~/lib/dataset.js" ;
12- import type { DatasetEval , ScoreAssignment } from "~/lib/dataset.js" ;
13- import { generatePromptsForEval , Task } from "~/lib/prompts.js" ;
11+ import { Eval } from "~/evals/index.js" ;
1412import {
1513 generateActionsSummary ,
1614 type EpisodeActions ,
1715} from "~/lib/summarizer.js" ;
18- import { loadPromptsFile } from "~/lib/prompts.js" ;
1916import { judges , getJudgeModelId } from "~/judges.js" ;
2017import { aggregateScores } from "~/lib/utils/scoreAggregation.js" ;
2118import type { Judge } from "~/lib/judgeTypes.js" ;
@@ -30,10 +27,6 @@ import { Logger } from "./lib/logger.js";
3027
3128type ModelCombination = string ;
3229
33- const evalIds = dataset
34- . map ( ( entry ) => entry . identifier )
35- . sort ( ( a , b ) => a . localeCompare ( b ) ) ;
36-
3730const cli = yargs ( hideBin ( process . argv ) )
3831 . scriptName ( "orvl" )
3932 . wrap ( null )
@@ -55,40 +48,22 @@ const cli = yargs(hideBin(process.argv))
5548 . strict ( ) ;
5649
// `generate` sub-command: regenerates the dataset by delegating to
// `Eval.generate`. The command registers no options of its own (only a usage
// example), so the handler does not read anything from the parsed argv.
cli.command(
  "generate",
  "Generate dataset for all evaluations",
  async (yargs) =>
    yargs.example([["orvl generate", "Generate dataset for all evaluations"]]),
  async () => {
    const logger = Logger.create();
    logger.log(`Generating dataset...`);

    // All generation work, including progress reporting through `logger`,
    // happens inside Eval.generate.
    await Eval.generate({ logger });
  },
);
8762
8863cli . command (
8964 "$0 [agent]" ,
9065 "Run benchmark evaluation" ,
91- ( yargs ) =>
66+ async ( yargs ) =>
9267 yargs
9368 . positional ( "agent" , {
9469 type : "string" ,
@@ -104,7 +79,6 @@ cli.command(
10479 . option ( "eval" , {
10580 type : "string" ,
10681 description : "eval to use in the format of repo@from..to" ,
107- choices : dataset . map ( ( entry ) => entry . identifier ) ,
10882 required : true ,
10983 } )
11084 . option ( "episodes" , {
@@ -130,10 +104,10 @@ cli.command(
130104 timeout : timeoutMins ,
131105 output : outputPath ,
132106 } ) => {
107+ const evals = await Eval . load ( ) ;
133108 const agent = getAgent ( agentName ) ;
134109 const model = getModel ( agent , modelFilter ) ;
135- const evalDef = getEval ( evalId ) ;
136- const tasks = getTasks ( evalDef ) ;
110+ const evalDef = getEval ( evals , evalId ) ;
137111 const logger = Logger . create ( `[model ${ model } ]` ) ;
138112
139113 // Run episodes
@@ -143,7 +117,7 @@ cli.command(
143117 const childLogger = logger . child ( `[episode ${ index } /${ episodes } ]` ) ;
144118 childLogger . log ( `Starting episode with ${ timeoutMins } min timeout...` ) ;
145119 return withRetries (
146- ( ) => runEpisode ( evalDef , agent , model , tasks , childLogger ) ,
120+ ( ) => runEpisode ( evalDef , agent , model , childLogger ) ,
147121 {
148122 retries : 3 ,
149123 timeoutMs : timeoutMins * 60 * 1000 ,
@@ -153,6 +127,9 @@ cli.command(
153127 } ) ,
154128 ) ;
155129
130+ // TODO
131+ console . log ( JSON . stringify ( settled , null , 2 ) ) ;
132+
156133 const results = settled
157134 . filter ( ( result ) => result . status === "fulfilled" )
158135 . map ( ( result ) => result . value )
@@ -215,8 +192,8 @@ cli.command(
215192 ) ;
216193
217194 printEvalResult ( episodeExports , evaluationResult , logger ) ;
218- buildEvalChart ( evaluationResult ) ;
219- storeEvalResult ( evaluationResult , outputPath ) ;
195+ buildEvalChart ( evaluationResult , logger ) ;
196+ storeEvalResult ( evaluationResult , outputPath , logger ) ;
220197 } ,
221198) ;
222199
@@ -251,8 +228,8 @@ function getModel(agent: Agent.Registration, modelFilter: string) {
251228 return model ;
252229}
253230
254- function getEval ( evalId : string ) {
255- const evalDef = dataset . find ( ( entry ) => entry . identifier === evalId ) ;
231+ function getEval ( evals : Eval . Instance [ ] , evalId : string ) {
232+ const evalDef = evals . find ( ( ev ) => ev . id === evalId ) ;
256233 if ( ! evalDef ) throw new Error ( `Eval ${ evalId } was not found.` ) ;
257234 if ( ! evalDef . scores . length )
258235 throw new Error (
@@ -261,50 +238,42 @@ function getEval(evalId: string) {
261238 return evalDef ;
262239}
263240
264- function getTasks ( evalDef : DatasetEval ) {
265- const tasks = loadPromptsFile ( evalDef . prompts ) ;
266- if ( tasks . length === 0 )
267- throw new Error (
268- `No prompts found in ${ evalDef . prompts } for ${ evalDef . repo } .` ,
269- ) ;
270- return tasks ;
271- }
272-
273241async function runEpisode (
274- evalDef : DatasetEval ,
242+ ev : Eval . Instance ,
275243 agent : Agent . Registration ,
276244 model : string ,
277- tasks : Task [ ] ,
278245 logger : Logger . Instance ,
279246) {
280247 const cwd = mkdtempSync ( join ( tmpdir ( ) , "openreval-" ) ) ;
281248
249+ // validate prompts
250+ if ( ev . prompts . length === 0 )
251+ throw new Error ( `No prompts found in ${ ev . prompts } for ${ ev . repo } .` ) ;
252+
282253 try {
283254 logger . log ( `Cloning repository...` ) ;
284- cloneRepositoryAtCommit ( cwd , evalDef . repo , evalDef . from ) ;
255+ cloneRepositoryAtCommit ( cwd , ev . repo , ev . from ) ;
285256
286257 const preparedScores = new Map < string , unknown > ( ) ;
287- for ( const assignment of evalDef . scores ) {
288- const scoreDefinition = scoreRegistry [ assignment . name ] ;
258+ for ( const score of ev . scores ) {
259+ const scoreDefinition = scoreRegistry [ score . name ] ;
289260 if ( ! scoreDefinition )
290261 throw new Error (
291- logger . format ( `Score ${ assignment . name } is not registered.` ) ,
262+ logger . format ( `Score ${ score . name } is not registered.` ) ,
292263 ) ;
293264
294265 try {
295266 const prepared = await scoreDefinition . prepare ( {
296- evaluation : evalDef ,
267+ ev : ev ,
297268 cwd,
298- config : assignment . args ,
269+ config : score . args ,
299270 logger,
300271 } ) ;
301- preparedScores . set ( assignment . name , prepared ) ;
272+ preparedScores . set ( score . name , prepared ) ;
302273 } catch ( error ) {
303274 const message = error instanceof Error ? error . message : String ( error ) ;
304275 throw new Error (
305- logger . format (
306- `Failed to prepare score ${ assignment . name } : ${ message } ` ,
307- ) ,
276+ logger . format ( `Failed to prepare score ${ score . name } : ${ message } ` ) ,
308277 ) ;
309278 }
310279 }
@@ -314,14 +283,14 @@ async function runEpisode(
314283 const usage = { input : 0 , output : 0 , cost : 0 } ;
315284 const episodeActions : string [ ] = [ ] ;
316285
317- for ( const task of tasks ) {
286+ for ( const prompt of ev . prompts ) {
318287 const childLogger = logger . child (
319- `[task ${ evalDef . repo . split ( "/" ) [ 1 ] } @${ task . commit . slice ( 0 , 7 ) } ]` ,
288+ `[prompt ${ ev . repo . split ( "/" ) [ 1 ] } @${ prompt . commit . slice ( 0 , 7 ) } ]` ,
320289 ) ;
321290
322291 try {
323292 const startedAt = Date . now ( ) ;
324- const result = await agent . definition . run ( model , task . prompt , cwd ! , {
293+ const result = await agent . definition . run ( model , prompt . prompt , cwd ! , {
325294 logger : childLogger ,
326295 } ) ;
327296 duration += Date . now ( ) - startedAt ;
@@ -337,7 +306,7 @@ async function runEpisode(
337306 const message = error instanceof Error ? error . message : String ( error ) ;
338307 throw new Error (
339308 childLogger . format (
340- `Agent run failed for planner task ${ task . commit } : ${ message } ` ,
309+ `Agent run failed for planner task ${ prompt . commit } : ${ message } ` ,
341310 ) ,
342311 ) ;
343312 }
@@ -348,7 +317,7 @@ async function runEpisode(
348317 // compare the untouched baseline against the desired target.
349318
350319 const episodeAggregation = await collectAggregationInputsForRun (
351- evalDef ,
320+ ev ,
352321 model ,
353322 cwd ,
354323 preparedScores ,
@@ -400,24 +369,33 @@ function printEvalResult(
400369
401370 if ( ! evalExport . exportData ) return ;
402371
403- const { finalScore, baseScore, variancePenalty } = evalExport . exportData ;
372+ const formatEpisode = ( final : number , base : number , penalty : number ) =>
373+ `final ${ final . toFixed ( 3 ) } (base ${ base . toFixed (
374+ 3 ,
375+ ) } - penalty ${ penalty . toFixed ( 3 ) } )`;
376+
404377 logger . log (
405378 "Episode recap:" ,
406- episodes . map (
407- ( episode , index ) =>
408- ` Episode ${ index + 1 } : final ${ episode . finalScore . toFixed (
409- 3 ,
410- ) } (base ${ episode . baseScore . toFixed (
411- 3 ,
412- ) } - penalty ${ episode . variancePenalty . toFixed ( 3 ) } ) `,
379+ ... episodes . map (
380+ ( episode , i ) =>
381+ ` Episode ${ i + 1 } : ${ formatEpisode (
382+ episode . finalScore ,
383+ episode . baseScore ,
384+ episode . variancePenalty ,
385+ ) } `,
413386 ) ,
414- `Aggregate final: ${ finalScore . toFixed ( 3 ) } (base ${ baseScore . toFixed (
415- 3 ,
416- ) } - penalty ${ variancePenalty . toFixed ( 3 ) } )`,
387+ `Aggregate final: ${ formatEpisode (
388+ evalExport . exportData . finalScore ,
389+ evalExport . exportData . baseScore ,
390+ evalExport . exportData . variancePenalty ,
391+ ) } `,
417392 ) ;
418393}
419394
420- function buildEvalChart ( evalExport : ReturnType < typeof summarizeAggregation > ) {
395+ function buildEvalChart (
396+ evalExport : ReturnType < typeof summarizeAggregation > ,
397+ logger : Logger . Instance ,
398+ ) {
421399 const chartUrl = buildRadarChartUrl ( {
422400 labels : evalExport . exportData . scores . map ( ( s ) => s . assignment . name ) ,
423401 values : evalExport . exportData . scores . map ( ( s ) =>
@@ -426,12 +404,13 @@ function buildEvalChart(evalExport: ReturnType<typeof summarizeAggregation>) {
426404 title : `${ evalExport . exportData . evaluation . repo } • ${ evalExport . exportData . model } ` ,
427405 datasetLabel : evalExport . exportData . model ,
428406 } ) ;
429- console . log ( `\nRadar Chart: ${ chartUrl } \n` ) ;
407+ logger . log ( `Radar Chart: ${ chartUrl } \n` ) ;
430408}
431409
432410function storeEvalResult (
433411 evalExport : ReturnType < typeof summarizeAggregation > ,
434- outputPath ?: string ,
412+ outputPath : string | undefined ,
413+ logger : Logger . Instance ,
435414) {
436415 if ( ! outputPath ) return ;
437416
@@ -443,9 +422,9 @@ function storeEvalResult(
443422
444423 writeFileSync ( outputPath , JSON . stringify ( evalExport . exportData , null , 2 ) ) ;
445424 } catch ( error ) {
446- const message =
447- error instanceof Error ? error . message : "Unknown error writing output." ;
448- throw new Error ( `Failed to write export to ${ outputPath } : ${ message } ` ) ;
425+ throw new Error (
426+ logger . format ( `Failed to write export to ${ outputPath } :` , error ) ,
427+ ) ;
449428 }
450429}
451430
@@ -470,7 +449,7 @@ function cleanupRepository(cwd: string, logger: Logger.Instance): void {
470449}
471450
472451async function collectAggregationInputsForRun (
473- datasetEval : DatasetEval ,
452+ ev : Eval . Instance ,
474453 model : ModelCombination ,
475454 cwd : string ,
476455 preparedReferences : Map < string , unknown > ,
@@ -479,7 +458,7 @@ async function collectAggregationInputsForRun(
479458 const aggregationInputs = new Map < string , ScoreAggregationInput > ( ) ;
480459
481460 for ( const judge of judges ) {
482- for ( const assignment of datasetEval . scores ) {
461+ for ( const assignment of ev . scores ) {
483462 const scoreDefinition = scoreRegistry [ assignment . name ] ;
484463
485464 if ( ! scoreDefinition ) {
@@ -502,7 +481,7 @@ async function collectAggregationInputsForRun(
502481 const result = await scoreDefinition . evaluate ( {
503482 judge,
504483 reference,
505- evaluation : datasetEval ,
484+ ev : ev ,
506485 cwd,
507486 config : assignment . args ,
508487 logger,
@@ -537,15 +516,15 @@ async function collectAggregationInputsForRun(
537516
538517function summarizeAggregation (
539518 agentName : string ,
540- datasetEval : DatasetEval ,
519+ ev : Eval . Instance ,
541520 model : ModelCombination ,
542521 aggregationInputs : Map < string , ScoreAggregationInput > ,
543522 episodes : Episode [ ] ,
544523 usage : Usage ,
545524 summary : string ,
546525 duration : number ,
547- ) : { lines : string [ ] ; exportData : EvaluationRunExport } {
548- const evalId = datasetEval . repo ;
526+ ) {
527+ const evalId = ev . repo ;
549528
550529 const aggregation = aggregateScores ( Array . from ( aggregationInputs . values ( ) ) ) ;
551530
@@ -591,10 +570,10 @@ function summarizeAggregation(
591570 const exportData : EvaluationRunExport = {
592571 agent : agentName ,
593572 evaluation : {
594- identifier : datasetEval . identifier ,
595- repo : datasetEval . repo ,
596- from : datasetEval . from ,
597- to : datasetEval . to ,
573+ identifier : ev . id ,
574+ repo : ev . repo ,
575+ from : ev . from ,
576+ to : ev . to ,
598577 } ,
599578 model,
600579 jobUrl : process . env . GITHUB_BENCHMARK_JOB_URL ! ,
@@ -649,12 +628,7 @@ function buildScoreExportsFromEpisodes(
649628
650629 episodes . forEach ( ( episode ) => {
651630 episode . scores . forEach ( ( score ) => {
652- const assignment : ScoreAssignment = {
653- name : score . assignment . name ,
654- weight : score . assignment . weight ,
655- args : score . assignment . args ,
656- } ;
657- const entry = ensureAggregationEntry ( aggregationInputs , assignment ) ;
631+ const entry = ensureAggregationEntry ( aggregationInputs , score . assignment ) ;
658632
659633 score . judges . forEach ( ( judgeResult ) => {
660634 const judge : Judge = {
@@ -677,7 +651,7 @@ function buildScoreExportsFromEpisodes(
677651
678652function ensureAggregationEntry (
679653 map : Map < string , ScoreAggregationInput > ,
680- assignment : ScoreAssignment ,
654+ assignment : Eval . Instance [ "scores" ] [ number ] ,
681655) : ScoreAggregationInput {
682656 if ( ! map . has ( assignment . name ) ) {
683657 map . set ( assignment . name , {
0 commit comments