Skip to content

Commit d9b552b

Browse files
committed
feat: add tokens per second metric
1 parent 9a3d9be commit d9b552b

3 files changed

Lines changed: 36 additions & 2 deletions

File tree

cli.ts

Lines changed: 30 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -295,11 +295,13 @@ async function main(): Promise<void> {
295295
logs: string[];
296296
actions: string[];
297297
usage: Usage;
298+
durationMs: number;
298299
}
299300

300301
const runEpisode = async (
301302
episodeIndex: number,
302303
): Promise<EpisodeResult> => {
304+
const episodeStartTime = Date.now();
303305
const episodeTag = `[episode ${episodeIndex}/${EPISODES}]`;
304306
const baselineCommit = evalDefinition.from;
305307
const prefix = `${episodeTag} [${combinationLabel}]`;
@@ -456,6 +458,7 @@ async function main(): Promise<void> {
456458
logs: [],
457459
actions: episodeActions,
458460
usage,
461+
durationMs: Date.now() - episodeStartTime,
459462
};
460463
} finally {
461464
if (cwd) {
@@ -513,6 +516,16 @@ async function main(): Promise<void> {
513516
{ input: 0, output: 0 },
514517
);
515518

519+
// Calculate total duration and tokens per second
520+
const totalDurationMs = episodeResults.reduce(
521+
(sum, result) => sum + result.durationMs,
522+
0,
523+
);
524+
const totalTokens = averageUsage.input + averageUsage.output;
525+
const durationSeconds = totalDurationMs / 1000;
526+
const tokensPerSecond =
527+
durationSeconds > 0 ? totalTokens / durationSeconds : 0;
528+
516529
for (const result of episodeResults) {
517530
mergeAggregationInputs(aggregatedInputs, result.aggregation);
518531
episodeExports.push({
@@ -560,6 +573,8 @@ async function main(): Promise<void> {
560573
episodeExports,
561574
averageUsage,
562575
summary,
576+
totalDurationMs,
577+
tokensPerSecond,
563578
);
564579
};
565580

@@ -578,8 +593,14 @@ async function main(): Promise<void> {
578593
});
579594

580595
if (evaluationResult.exportData) {
581-
const { episodes, finalScore, baseScore, variancePenalty } =
582-
evaluationResult.exportData;
596+
const {
597+
episodes,
598+
finalScore,
599+
baseScore,
600+
variancePenalty,
601+
durationMs,
602+
tokensPerSecond,
603+
} = evaluationResult.exportData;
583604
if (episodes.length > 0) {
584605
console.log("[debug] Episode recap:");
585606
episodes.forEach((episode, index) => {
@@ -599,6 +620,9 @@ async function main(): Promise<void> {
599620
3,
600621
)})`,
601622
);
623+
console.log(
624+
`[debug] Performance: ${(durationMs / 1000).toFixed(1)}s, ${tokensPerSecond.toFixed(1)} tokens/sec`,
625+
);
602626

603627
// Generate and log radar chart URL
604628
const chartUrl = buildRadarChartUrl({
@@ -780,6 +804,8 @@ function summarizeAggregation(
780804
episodes: Episode[],
781805
usage: Usage,
782806
summary: string,
807+
durationMs: number,
808+
tokensPerSecond: number,
783809
): { lines: string[]; exportData: EvaluationRunExport } {
784810
const evalId = datasetEval.repo;
785811
const runContext = contextLabel ? `${evalId} [${contextLabel}]` : evalId;
@@ -842,6 +868,8 @@ function summarizeAggregation(
842868
episodes,
843869
usage,
844870
summary,
871+
durationMs,
872+
tokensPerSecond,
845873
};
846874

847875
return { lines, exportData };

scripts/discord-sample.ts

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -231,6 +231,8 @@ const sampleExport: EvaluationRunExport[] = [
231231
episodes: claudeEpisodes,
232232
usage: { input: 50000, output: 10100 },
233233
summary: "",
234+
durationMs: 180000,
235+
tokensPerSecond: 334.0,
234236
},
235237
{
236238
agent: "opencode",
@@ -250,6 +252,8 @@ const sampleExport: EvaluationRunExport[] = [
250252
episodes: gptEpisodes,
251253
usage: { input: 48167, output: 9633 },
252254
summary: "",
255+
durationMs: 165000,
256+
tokensPerSecond: 350.9,
253257
},
254258
];
255259

types/export.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,4 +54,6 @@ export interface EvaluationRunExport {
5454
episodes: Episode[];
5555
usage: Usage;
5656
summary: string;
57+
durationMs: number;
58+
tokensPerSecond: number;
5759
}

0 commit comments

Comments
 (0)