Skip to content

Commit 605f704

Browse files
committed
wip
1 parent 1394e2d commit 605f704

File tree

3 files changed

+250
-467
lines changed

3 files changed

+250
-467
lines changed

.github/workflows/benchmark-reusable.yml

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,30 @@ permissions:
1717
actions: read
1818

1919
jobs:
20+
prepare-analysis:
21+
name: Prepare Judge Analysis Matrix
22+
runs-on: ubuntu-latest
23+
outputs:
24+
evals: ${{ steps.compute.outputs.evals }}
25+
steps:
26+
- name: Extract unique evaluations
27+
id: compute
28+
run: |
29+
set -euo pipefail
30+
matrix_json=$(cat <<'JSON'
31+
${{ inputs.matrix }}
32+
JSON
33+
)
34+
evals=$(jq -c 'map(.eval) | unique' <<<"$matrix_json")
35+
36+
if [ -z "${evals}" ] || [ "${evals}" = "null" ]; then
37+
echo "No evaluations found in matrix definition." >&2
38+
evals="[]"
39+
fi
40+
41+
echo "Analysis eval matrix: ${evals}"
42+
echo "evals=${evals}" >> "$GITHUB_OUTPUT"
43+
2044
benchmark:
2145
name: Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
2246
runs-on: ubuntu-latest
@@ -148,10 +172,8 @@ jobs:
148172
echo "benchmark.json not found, skipping summary log"
149173
fi
150174
151-
- name: Generate Judges Summary for this Evaluation
175+
- name: Run benchmark-level judge analysis
152176
env:
153-
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
154-
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
155177
BENCHMARK_EVAL: ${{ matrix.eval }}
156178
run: |
157179
set -euo pipefail
@@ -161,7 +183,7 @@ jobs:
161183
echo "Evaluation: ${BENCHMARK_EVAL}"
162184
echo "═══════════════════════════════════════════════════════"
163185
echo ""
164-
bun run scripts/judges-summary.ts benchmark.json --ai-summary || true
186+
bun run scripts/analysis.ts benchmark.json || true
165187
echo ""
166188
167189
- name: Prepare artifact name
@@ -183,6 +205,55 @@ jobs:
183205
name: ${{ steps.artifact.outputs.name }}
184206
path: benchmark.json
185207

208+
eval-analysis:
209+
name: Judge Analysis - ${{ matrix.eval }}
210+
runs-on: ubuntu-latest
211+
needs:
212+
- benchmark
213+
- prepare-analysis
214+
if: needs.prepare-analysis.outputs.evals != '[]'
215+
environment: production
216+
strategy:
217+
fail-fast: false
218+
matrix:
219+
eval: ${{ fromJSON(needs.prepare-analysis.outputs.evals) }}
220+
steps:
221+
- name: Checkout repository
222+
uses: actions/checkout@v4
223+
224+
- name: Setup Bun
225+
uses: oven-sh/setup-bun@v1
226+
with:
227+
bun-version: 1.2.21
228+
229+
- name: Install dependencies
230+
run: bun install --frozen-lockfile
231+
232+
- name: Download benchmark artifacts for eval
233+
uses: actions/download-artifact@v4
234+
with:
235+
path: eval-benchmarks
236+
pattern: benchmark-*-*-${{ replace(matrix.eval, '/', '-') }}
237+
238+
- name: Merge benchmark exports
239+
id: merge
240+
run: |
241+
set -euo pipefail
242+
243+
if [ ! -d eval-benchmarks ] || ! find eval-benchmarks -type f -name '*.json' -print -quit | grep -q .; then
244+
echo "No benchmark artifacts found for eval ${{ matrix.eval }}; skipping analysis."
245+
echo "has_data=false" >> "$GITHUB_OUTPUT"
246+
exit 0
247+
fi
248+
249+
bun run scripts/merge-benchmark-exports.ts eval-benchmarks merged-benchmark.json
250+
echo "Merged benchmark export ready for analysis."
251+
echo "has_data=true" >> "$GITHUB_OUTPUT"
252+
253+
- name: Run judges analysis
254+
if: steps.merge.outputs.has_data == 'true'
255+
run: bun run scripts/analysis.ts merged-benchmark.json
256+
186257
notify:
187258
runs-on: ubuntu-latest
188259
needs: benchmark

scripts/analysis.ts

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
#!/usr/bin/env bun
2+
/**
3+
* Agent Analysis Script
4+
*
5+
* Compares agent/model performance for one or more benchmark exports.
6+
* Accepts either a single benchmark run or a merged export containing
7+
* multiple runs of the same evaluation across different agents/models.
8+
*
9+
* Usage:
10+
* bun run scripts/analysis.ts path/to/benchmark.json
11+
* bun run scripts/analysis.ts path/to/merged.json
12+
*/
13+
14+
import { readFileSync } from "node:fs";
15+
import process from "node:process";
16+
import { generateText } from "ai";
17+
import type { EvaluationRunExport } from "~/types/export.js";
18+
import { getZenLanguageModel } from "~/lib/zenModels.js";
19+
20+
export const AGENT_ANALYSIS_PROMPT = `You are an expert analyst reviewing how different agents and models perform on the same benchmark evaluation.
21+
22+
Your task is to analyze the benchmark data and identify:
23+
1. **Systematic patterns**: Are certain agents or models consistently leading or lagging?
24+
2. **Performance gaps**: Where are the largest score deltas, and what might explain them?
25+
3. **Agent tendencies**: Do some runs prioritize certain behaviors (e.g., safety, completeness) based on their summaries?
26+
4. **Notable insights**: Highlight interesting contrasts between the strongest and weakest runs.
27+
5. **Recommendations**: Suggest concrete adjustments or experiments to improve future runs.
28+
29+
Focus on concrete observations from the data provided. Look for patterns such as:
30+
- Consistent scoring differences between specific agents or models
31+
- Summaries that reveal different optimization strategies or failure modes
32+
- Runs that score well overall but exhibit weaknesses in their own write-ups
33+
34+
Provide a concise, insightful analysis that helps developers understand agent behavior and improve the evaluation system.`;
35+
36+
const fallback = (envName: string, defaultValue: string): string =>
37+
process.env[envName]?.trim() || defaultValue;
38+
39+
const analyzerModelId = fallback(
40+
"ANALYZER_MODEL",
41+
"opencode/claude-sonnet-4-5",
42+
);
43+
44+
/**
 * Builds the user-prompt context handed to the analyzer model.
 *
 * Output is a lightweight markdown document: an evaluation header (repo,
 * run count) followed by a numbered scoreboard line per run with its
 * final/base scores and variance penalty (3 decimal places each), plus the
 * run's own summary when present.
 *
 * NOTE(review): assumes `runs` is non-empty — `runs[0]` is read without a
 * guard. The only visible caller (main) exits before calling this with an
 * empty array; confirm before reusing elsewhere.
 */
function buildDynamicContext(runs: EvaluationRunExport[]): string {
  const repo = runs[0].evaluation.repo;
  const parts: string[] = [];

  parts.push(`# Evaluation
- Repository: ${repo}
- Total runs: ${runs.length}
`);

  parts.push("# Run Scoreboard");
  runs.forEach((run, index) => {
    parts.push(
      // One scoreboard line per run; toFixed(3) keeps scores comparable.
      `${index + 1}. ${run.agent} (${run.model}) — final ${run.finalScore.toFixed(3)}, base ${run.baseScore.toFixed(
        3,
      )}, penalty ${run.variancePenalty.toFixed(3)}`,
    );
    // Include the run's self-reported summary only when it is non-blank.
    if (run.summary?.trim()) {
      parts.push(` Summary: ${run.summary.trim()}`);
    }
  });
  parts.push("");

  return parts.join("\n");
}
68+
69+
/**
 * Plain-text report used when the AI analysis call fails.
 *
 * Lists each run with its final/base scores and variance penalty, collapses
 * each run's own summary onto a single line, and — when more than one run is
 * present — appends the score gap between the first and last entries.
 *
 * NOTE(review): assumes `runs` is non-empty (`runs[0]` unguarded) and
 * already sorted best-to-worst; the "best to worst" label and the top/bottom
 * gap both rely on the caller (generateAnalysis receives pre-sorted runs
 * from main) having ordered the array.
 */
function formatFallbackSummary(runs: EvaluationRunExport[]): string {
  const repo = runs[0].evaluation.repo;
  const lines: string[] = [];

  lines.push(`Evaluation: ${repo}`);
  lines.push("");
  lines.push("Runs (best to worst):");

  runs.forEach((run, index) => {
    lines.push(
      `${index + 1}. ${run.agent} (${run.model}) – final ${run.finalScore.toFixed(
        3,
      )}, base ${run.baseScore.toFixed(3)}, penalty ${run.variancePenalty.toFixed(
        3,
      )}`,
    );

    if (run.summary?.trim()) {
      // Collapse all internal whitespace so multi-line summaries render
      // as one indented line under the scoreboard entry.
      lines.push(` summary: ${run.summary.replace(/\s+/g, " ").trim()}`);
    }
  });

  if (runs.length > 1) {
    // Spread between the strongest and weakest run (assumes sorted input).
    const leader = runs[0];
    const trailer = runs[runs.length - 1];
    const gap = leader.finalScore - trailer.finalScore;
    lines.push("");
    lines.push(`Score gap (top vs bottom): ${gap.toFixed(3)}`);
  }

  return lines.join("\n").trimEnd();
}
101+
102+
async function generateAnalysis(
103+
runs: EvaluationRunExport[],
104+
): Promise<string> {
105+
const context = buildDynamicContext(runs);
106+
107+
try {
108+
const { text } = await generateText({
109+
model: getZenLanguageModel(analyzerModelId),
110+
system: AGENT_ANALYSIS_PROMPT,
111+
prompt: context,
112+
temperature: 0.3,
113+
});
114+
return text.trim();
115+
} catch (error) {
116+
const message = error instanceof Error ? error.message : String(error);
117+
return `Failed to generate AI analysis (${message}).\n\n${formatFallbackSummary(
118+
runs,
119+
)}`;
120+
}
121+
}
122+
123+
function usage(): void {
124+
console.error("Usage: bun run scripts/analysis.ts <benchmark-file.json>");
125+
console.error("");
126+
console.error(
127+
"Generates an AI-powered comparison of agent/model performance.",
128+
);
129+
}
130+
131+
async function main(): Promise<void> {
132+
const args = process.argv.slice(2);
133+
134+
if (args.length === 0) {
135+
usage();
136+
process.exit(1);
137+
}
138+
139+
const filePath = args[0];
140+
let parsed: unknown;
141+
142+
try {
143+
const fileContent = readFileSync(filePath, "utf-8");
144+
parsed = JSON.parse(fileContent) as unknown;
145+
} catch (error) {
146+
console.error(`Error reading file ${filePath}:`, error);
147+
process.exit(1);
148+
}
149+
150+
const runs = (Array.isArray(parsed)
151+
? (parsed as EvaluationRunExport[])
152+
: [parsed as EvaluationRunExport]).filter(
153+
(run): run is EvaluationRunExport =>
154+
run != null && typeof run === "object" && "finalScore" in run,
155+
);
156+
157+
if (runs.length === 0) {
158+
console.error("No evaluation runs found in the provided file.");
159+
process.exit(1);
160+
}
161+
162+
const orderedRuns = [...runs].sort(
163+
(a, b) => b.finalScore - a.finalScore,
164+
);
165+
166+
const output = await generateAnalysis(orderedRuns);
167+
process.stdout.write(`${output.trimEnd()}\n`);
168+
}
169+
170+
if (import.meta.main) {
171+
main().catch((error) => {
172+
console.error("Error:", error);
173+
process.exit(1);
174+
});
175+
}

0 commit comments

Comments
 (0)