@@ -295,11 +295,13 @@ async function main(): Promise<void> {
295295 logs : string [ ] ;
296296 actions : string [ ] ;
297297 usage : Usage ;
298+ durationMs : number ;
298299 }
299300
300301 const runEpisode = async (
301302 episodeIndex : number ,
302303 ) : Promise < EpisodeResult > => {
304+ const episodeStartTime = Date . now ( ) ;
303305 const episodeTag = `[episode ${ episodeIndex } /${ EPISODES } ]` ;
304306 const baselineCommit = evalDefinition . from ;
305307 const prefix = `${ episodeTag } [${ combinationLabel } ]` ;
@@ -456,6 +458,7 @@ async function main(): Promise<void> {
456458 logs : [ ] ,
457459 actions : episodeActions ,
458460 usage,
461+ durationMs : Date . now ( ) - episodeStartTime ,
459462 } ;
460463 } finally {
461464 if ( cwd ) {
@@ -513,6 +516,16 @@ async function main(): Promise<void> {
513516 { input : 0 , output : 0 } ,
514517 ) ;
515518
519+ // Sum per-episode durations and derive tokens per second from averageUsage (0 when no time elapsed)
520+ const totalDurationMs = episodeResults . reduce (
521+ ( sum , result ) => sum + result . durationMs ,
522+ 0 ,
523+ ) ;
524+ const totalTokens = averageUsage . input + averageUsage . output ;
525+ const durationSeconds = totalDurationMs / 1000 ;
526+ const tokensPerSecond =
527+ durationSeconds > 0 ? totalTokens / durationSeconds : 0 ;
528+
516529 for ( const result of episodeResults ) {
517530 mergeAggregationInputs ( aggregatedInputs , result . aggregation ) ;
518531 episodeExports . push ( {
@@ -560,6 +573,8 @@ async function main(): Promise<void> {
560573 episodeExports ,
561574 averageUsage ,
562575 summary ,
576+ totalDurationMs ,
577+ tokensPerSecond ,
563578 ) ;
564579 } ;
565580
@@ -578,8 +593,14 @@ async function main(): Promise<void> {
578593 } ) ;
579594
580595 if ( evaluationResult . exportData ) {
581- const { episodes, finalScore, baseScore, variancePenalty } =
582- evaluationResult . exportData ;
596+ const {
597+ episodes,
598+ finalScore,
599+ baseScore,
600+ variancePenalty,
601+ durationMs,
602+ tokensPerSecond,
603+ } = evaluationResult . exportData ;
583604 if ( episodes . length > 0 ) {
584605 console . log ( "[debug] Episode recap:" ) ;
585606 episodes . forEach ( ( episode , index ) => {
@@ -599,6 +620,9 @@ async function main(): Promise<void> {
599620 3 ,
600621 ) } )`,
601622 ) ;
623+ console . log (
624+ `[debug] Performance: ${ ( durationMs / 1000 ) . toFixed ( 1 ) } s, ${ tokensPerSecond . toFixed ( 1 ) } tokens/sec` ,
625+ ) ;
602626
603627 // Generate and log radar chart URL
604628 const chartUrl = buildRadarChartUrl ( {
@@ -780,6 +804,8 @@ function summarizeAggregation(
780804 episodes : Episode [ ] ,
781805 usage : Usage ,
782806 summary : string ,
807+ durationMs : number ,
808+ tokensPerSecond : number ,
783809) : { lines : string [ ] ; exportData : EvaluationRunExport } {
784810 const evalId = datasetEval . repo ;
785811 const runContext = contextLabel ? `${ evalId } [${ contextLabel } ]` : evalId ;
@@ -842,6 +868,8 @@ function summarizeAggregation(
842868 episodes,
843869 usage,
844870 summary,
871+ durationMs,
872+ tokensPerSecond,
845873 } ;
846874
847875 return { lines, exportData } ;
0 commit comments