Get things in better shape: record agent actions and generate a run summary

Aslemammad · Aslemammad · commit 54643ba4d4e5 · 2025-10-31T21:30:41.000+03:30
diff --git a/.github/workflows/benchmark-reusable.yml b/.github/workflows/benchmark-reusable.yml
@@ -140,6 +140,18 @@ jobs:
               exit 1
             fi
 
+      # Informational step: prints the run summary stored in benchmark.json.
+      # It runs even when earlier steps failed (`if: always()`), so it should
+      # never fail the job on its own.
+      - name: Log benchmark summary
+        if: always()
+        run: |
+          set -euo pipefail
+          if [ -f benchmark.json ]; then
+            echo "=== Benchmark Summary ==="
+            # jq's `//` only replaces null/false, so an empty-string summary
+            # needs its own check. A jq failure (e.g. malformed JSON) is
+            # reported instead of aborting the step under `set -e`.
+            # NOTE(review): assumes the top-level JSON value is an object - confirm.
+            jq -r 'if (.summary // "") == "" then "No summary available" else .summary end' benchmark.json \
+              || echo "Unable to read summary from benchmark.json"
+            echo "========================"
+          else
+            echo "benchmark.json not found, skipping summary log"
+          fi
+
       - name: Prepare artifact name
         id: artifact
         env:
diff --git a/agents/claude-code.ts b/agents/claude-code.ts
@@ -106,6 +106,7 @@ const claudeCodeAgent: AgentDefinition = {
     const cacheKey = sessionKey(cwd, model);
     const existingSessionID = sessionCache.get(cacheKey);
 
+    const actions: string[] = [];
     const usage = {
       input: 0,
       output: 0,
@@ -128,12 +129,13 @@ const claudeCodeAgent: AgentDefinition = {
         // Extract and cache session ID from messages
         sessionCache.set(cacheKey, message.session_id);
 
-        // Accumulate token usage if available
-        if (message.usage) {
+        // Accumulate token usage if available (only SDKResultMessage has usage)
+        if (message.type === "result" && "usage" in message) {
           usage.input += message.usage.input_tokens || 0;
           usage.output += message.usage.output_tokens || 0;
         }
 
+        actions.push(JSON.stringify(message));
         logJson(message, options);
       }
     } catch (error) {
@@ -149,7 +151,7 @@ const claudeCodeAgent: AgentDefinition = {
       throw error;
     }
 
-    return { command: displayCommand, usage };
+    return { command: displayCommand, actions, usage };
   },
 };
 
diff --git a/agents/opencode.ts b/agents/opencode.ts
@@ -170,6 +170,7 @@ const opencodeAgent: AgentDefinition = {
       sessionCache.set(cacheKey, sessionID);
     }
 
+    const actions: string[] = [];
     const usage = {
       input: 0,
       output: 0,
@@ -192,6 +193,11 @@ const opencodeAgent: AgentDefinition = {
       usage.input = data.info.tokens.input;
       usage.output = data.info.tokens.output;
 
+      actions.push(JSON.stringify(data.info));
+      if (Array.isArray(data.parts)) {
+        data.parts.forEach((part) => actions.push(JSON.stringify(part)));
+      }
+
       logPromptResult(data, options);
     } catch (error) {
       sessionCache.delete(cacheKey);
@@ -205,7 +211,7 @@ const opencodeAgent: AgentDefinition = {
       throw error;
     }
 
-    return { command: displayCommand, usage };
+    return { command: displayCommand, actions, usage };
   },
 };
 
diff --git a/cli.ts b/cli.ts
@@ -16,6 +16,10 @@ import { listScores, scores as scoreRegistry } from "~/scores/index.js";
 import { dataset } from "~/lib/dataset.js";
 import type { DatasetEval, ScoreAssignment } from "~/lib/dataset.js";
 import { generatePlannerTasks, type PlannerTask } from "~/lib/planner.js";
+import {
+  generateActionsSummary,
+  type EpisodeActions,
+} from "~/lib/summarizer.js";
 import { fetchPlannerCommitDiffs } from "~/lib/github.js";
 import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";
 import { judges, getJudgeModelId } from "~/judges.js";
@@ -31,7 +35,6 @@ import type {
 import type {
   Episode,
   EvaluationRunExport,
-  TokenUsage,
   Usage,
 } from "~/types/export.js";
 import { withRetries, withTimeout } from "~/lib/utils/retry.js";
@@ -257,6 +260,7 @@ async function main(): Promise<void> {
         aggregationSummary: AggregationSummary;
         scoreExports: EvaluationRunExport["scores"];
         logs: string[];
+        actions: string[];
         usage: Usage;
       }
 
@@ -307,6 +311,7 @@ async function main(): Promise<void> {
           let tasksExecuted = 0;
 
           let usage: Usage = { input: 0, output: 0 };
+          const episodeActions: string[] = [];
 
           for (const task of plannerTasks) {
             const logPrefix = `${prefix} ${task.commit}`;
@@ -349,6 +354,9 @@ async function main(): Promise<void> {
 
               usage.input += result.usage.input;
               usage.output += result.usage.output;
+
+              // Collect actions from this task
+              episodeActions.push(...result.actions);
             } catch (error) {
               const message =
                 error instanceof Error ? error.message : String(error);
@@ -405,6 +413,7 @@ async function main(): Promise<void> {
             aggregationSummary,
             scoreExports: episodeScoreExports,
             logs: [],
+            actions: episodeActions,
             usage,
           };
         } finally {
@@ -450,6 +459,7 @@ async function main(): Promise<void> {
       const aggregatedInputs = new Map<string, ScoreAggregationInput>();
       const episodeExports: Episode[] = [];
       const allLogs: string[] = [];
+      const episodesActions: EpisodeActions[] = [];
       const averageUsage = episodeResults.reduce(
         (prev, { usage }) => ({
           input: prev.input + usage.input / episodeResults.length,
@@ -472,6 +482,28 @@ async function main(): Promise<void> {
         if (result.logs && result.logs.length > 0) {
           allLogs.push(...result.logs);
         }
+
+        // Collect actions for summarization
+        episodesActions.push({
+          episodeIndex: result.index,
+          actions: result.actions,
+        });
+      }
+
+      // Generate summary from all episodes' actions
+      let summary = "";
+      try {
+        summary = await generateActionsSummary(
+          evalDefinition,
+          model,
+          episodesActions,
+        );
+      } catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        console.error(
+          `[${combinationLabel}] Failed to generate summary: ${message}`,
+        );
+        summary = ""; // Keep empty string on failure
       }
 
       return summarizeAggregation(
@@ -482,7 +514,6 @@ async function main(): Promise<void> {
         aggregatedInputs,
         episodeExports,
         averageUsage,
-        "",
         summary,
       );
     };
@@ -497,7 +528,7 @@ async function main(): Promise<void> {
       agentName,
     );
 
-    evaluationResult.summaries.forEach((line) => {
+    evaluationResult.lines.forEach((line) => {
       console.log(line);
     });
 
diff --git a/index.ts b/index.ts
@@ -1,7 +1,6 @@
 export { getAgent, listAgents } from "~/agents/index.js";
 export { scores, getScore, listScores } from "~/scores/index.js";
 export type { AgentDefinition, AgentExecutor, AgentPrompt } from "~/lib/createAgent.js";
-export { createAgent } from "~/lib/createAgent.js";
 export { createScore } from "~/lib/createScore.js";
 export type {
   ScoreDefinition,
diff --git a/lib/createAgent.ts b/lib/createAgent.ts
@@ -24,7 +24,7 @@ export interface AgentDefinition {
 
 export interface AgentRunResult {
   command: string;
-  messages: string[];
+  actions: string[];
   usage: {
     input: number;
     output: number;
diff --git a/lib/summarizer.ts b/lib/summarizer.ts
@@ -0,0 +1,97 @@
+import { generateText } from "ai";
+import { z } from "zod";
+
+import type { DatasetEval } from "~/lib/dataset.js";
+import { getZenLanguageModel } from "~/lib/zenModels.js";
+
+/**
+ * Reads an environment variable, falling back to a default.
+ *
+ * The value is trimmed first; an unset, empty, or whitespace-only variable
+ * yields `defaultValue`.
+ */
+const fallback = (envName: string, defaultValue: string): string => {
+  const trimmed = process.env[envName]?.trim();
+  return trimmed ? trimmed : defaultValue;
+};
+
+/** Actions recorded for a single evaluation episode. */
+export interface EpisodeActions {
+  /** Index of the episode; rendered in the "### Episode N" prompt heading. */
+  episodeIndex: number;
+  /** One string per recorded action; joined line by line into the prompt. */
+  actions: string[];
+}
+
+// System prompt for the summarizer model. It deliberately avoids naming a
+// fixed episode count, because the caller may pass any number of episodes.
+const systemPrompt = `You are a technical summarizer that creates concise, informative summaries of autonomous agent activities across multiple evaluation episodes.
+
+Your task:
+- Analyze the actions taken by an AI agent across one or more separate episodes of the same task
+- Identify common patterns, tools used, files modified, and key behaviors
+- Produce a clear, structured summary that highlights what the agent did
+
+Focus on:
+- **Tool usage patterns**: Which tools were used most frequently
+- **File modifications**: Which files were created, edited, or read
+- **Common strategies**: What approach did the agent consistently take
+- **Consistency**: Did the agent behave similarly across episodes, or vary significantly?
+- **Outcomes**: Any errors, successes, or notable behaviors
+
+Output format:
+Write 2-4 paragraphs in a professional, technical style. Be concise but informative.
+
+Structure:
+1. **Overview**: Brief description of what the agent accomplished
+2. **Approach**: Tools and strategies used consistently across episodes
+3. **Key actions**: Specific files modified or critical operations performed
+4. **Observations**: Any notable patterns, inconsistencies, or issues
+
+Guidelines:
+- Keep it under 300 words
+- Use technical language but be clear
+- Focus on patterns across episodes, not individual actions
+- Mention specific tool names and file paths when relevant
+- Note any errors or issues encountered
+- Be objective and descriptive, not evaluative`;
+
+// Model that writes the summary; the SUMMARIZER_MODEL environment variable
+// overrides the built-in default.
+const summarizerModelEnvVar = "SUMMARIZER_MODEL";
+const defaultSummarizerModelId = "opencode/claude-sonnet-4-5";
+const summarizerModelId = fallback(
+  summarizerModelEnvVar,
+  defaultSummarizerModelId,
+);
+
+/** Maximum number of actions per episode that are included in the prompt. */
+const maxActionsPerEpisode = 50;
+
+/**
+ * Summarizes what an agent did across the episodes of one evaluation.
+ *
+ * Builds a prompt from the repository, model and commit range of
+ * `evaluation` plus the first `maxActionsPerEpisode` actions of every
+ * episode, and asks the summarizer model for a short write-up.
+ *
+ * @param evaluation - Evaluation being summarized; `repo`, `from` and `to` are read.
+ * @param model - Identifier of the evaluated model, echoed into the prompt.
+ * @param episodesActions - Recorded actions, one entry per episode.
+ * @returns The trimmed summary text; `"No actions recorded"` when no episode
+ *   recorded any action; or an `"Unable to generate summary: ..."` message
+ *   when the model call fails. Failures while calling the model are logged
+ *   and returned as text rather than thrown.
+ */
+export async function generateActionsSummary(
+  evaluation: DatasetEval,
+  model: string,
+  episodesActions: EpisodeActions[],
+): Promise<string> {
+  // Nothing to summarize: no episodes at all, or every episode came back
+  // empty. Skip the model call rather than asking it to describe nothing.
+  if (episodesActions.every((episode) => episode.actions.length === 0)) {
+    return "No actions recorded";
+  }
+
+  // Build a structured prompt with the actions data
+  const episodesSummary = episodesActions
+    .map(({ episodeIndex, actions }) => {
+      // Only the leading actions are sent; the remainder is reported as a count.
+      const shown = actions.slice(0, maxActionsPerEpisode);
+      const omitted = actions.length - shown.length;
+      const truncated = omitted > 0 ? `\n... (${omitted} more actions)` : "";
+
+      return `### Episode ${episodeIndex}
+Actions (${actions.length} total):
+${shown.join("\n")}${truncated}`;
+    })
+    .join("\n\n");
+
+  const prompt = `Repository: ${evaluation.repo}
+Model: ${model}
+Task: Implement changes from ${evaluation.from.slice(0, 7)} to ${evaluation.to.slice(0, 7)}
+
+${episodesSummary}
+
+Provide a concise summary of what the agent did across these episodes.`;
+
+  try {
+    const result = await generateText({
+      model: getZenLanguageModel(summarizerModelId),
+      system: systemPrompt,
+      temperature: 0.3,
+      prompt,
+    });
+
+    return result.text.trim();
+  } catch (error) {
+    // Best effort: a failed summary must not abort the evaluation run, so the
+    // error is logged and surfaced as the summary text instead of rethrown.
+    const message = error instanceof Error ? error.message : String(error);
+    console.error(`[summarizer] Failed to generate summary: ${message}`);
+    return `Unable to generate summary: ${message}`;
+  }
+}
diff --git a/scripts/discord-sample.ts b/scripts/discord-sample.ts
@@ -126,18 +126,21 @@ const claudeEpisodes: Episode[] = [
     baseScore: 0.912,
     variancePenalty: 0.003,
     scores: cloneScores(claudeScores, 0.002, -0.005),
+    usage: { input: 50000, output: 10000 },
   },
   {
     finalScore: 0.901,
     baseScore: 0.905,
     variancePenalty: 0.004,
     scores: cloneScores(claudeScores, 0, 0),
+    usage: { input: 51000, output: 10500 },
   },
   {
     finalScore: 0.896,
     baseScore: 0.902,
     variancePenalty: 0.006,
     scores: cloneScores(claudeScores, -0.002, 0.004),
+    usage: { input: 49000, output: 9800 },
   },
 ];
 
@@ -183,18 +186,21 @@ const gptEpisodes: Episode[] = [
     baseScore: 0.907,
     variancePenalty: 0.004,
     scores: cloneScores(gptScores, 0.003, -0.006),
+    usage: { input: 48000, output: 9500 },
   },
   {
     finalScore: 0.894,
     baseScore: 0.898,
     variancePenalty: 0.004,
     scores: cloneScores(gptScores, -0.002, 0.002),
+    usage: { input: 49000, output: 9800 },
   },
   {
     finalScore: 0.892,
     baseScore: 0.897,
     variancePenalty: 0.005,
     scores: cloneScores(gptScores, -0.003, 0.004),
+    usage: { input: 47500, output: 9400 },
   },
 ];
 
@@ -214,6 +220,8 @@ const sampleExport: EvaluationRunExport[] = [
     variancePenalty: 0.003,
     scores: claudeScores,
     episodes: claudeEpisodes,
+    usage: { input: 50000, output: 10100 },
+    summary: "",
   },
   {
     agent: "opencode",
@@ -230,6 +238,8 @@ const sampleExport: EvaluationRunExport[] = [
     variancePenalty: 0.004,
     scores: gptScores,
     episodes: gptEpisodes,
+    usage: { input: 48167, output: 9633 },
+    summary: "",
   },
 ];