
Commit 3bc37da

feat: introducing 3 episodes per benchmark (#5)

* removing complex benchmark output
* aggregate the 3 runs in parallel
* use opencode dev version
* exclude opencode from the bundle
* more information for the error
* fix discord-sample.ts file

1 parent 4e479b1, commit 3bc37da

9 files changed: 592 additions & 683 deletions

.github/workflows/publish-benchmark.yml

Lines changed: 2 additions & 2 deletions

```diff
@@ -45,7 +45,7 @@ jobs:
       - id: matrix
         name: Build benchmark matrix
         run: |
-          bun add -g opencode-ai @openai/codex-sdk
+          bun add -g opencode-ai@dev @openai/codex-sdk
           set -euo pipefail
           MATRIX_JSON="$(bun run scripts/generate-benchmark-matrix.ts)"
           printf 'matrix=%s\n' "${MATRIX_JSON}" >> "$GITHUB_OUTPUT"
@@ -76,7 +76,7 @@ jobs:
         run: bun install --frozen-lockfile
 
       - name: Install OpenCode CLI
-        run: bun add -g opencode-ai @openai/codex-sdk
+        run: bun add -g opencode-ai@dev @openai/codex-sdk
 
       - name: Determine benchmark job URL
         id: job_url
```

README.md

Lines changed: 4 additions & 5 deletions

````diff
@@ -3,12 +3,11 @@
 A benchmarking framework for evaluating opencode's AI coding agents across real-world GitHub repositories. The framework runs agents against target repositories and scores their outputs using multiple LLM judges, measuring code quality across dimensions like readability, functionality, adherence to best practices, and efficiency.
 
 ```bash
-orvl opencode  # run opencode on all models x evals x scores
-orvl opencode --model opencode/qwen3-coder  # filter by model across all evals x scores
-orvl opencode --eval noworneverev/graphrag-visualizer  # filter by eval across models x scores
+orvl opencode --model opencode/gpt-5-codex --eval noworneverev/graphrag-visualizer
+orvl opencode --model opencode/claude-sonnet-4-5 --eval prismicio-community/course-fizzi-next --output results.json
 ```
 
-Filters use CLI options like `--model`, `--eval`, and `--score`.
+Both `--model` and `--eval` are required; the CLI now runs a single agent/model/eval pairing at a time. Each invocation executes three isolated `[episode X/3]` runs (fresh clones) and aggregates the judge scores before exporting results.
 
 ## Setup
 ```bash
@@ -19,7 +18,7 @@ bun run build
 During development the CLI can be executed directly with Bun:
 
 ```bash
-bun run dev -- <agent> [--model <model>] [--eval <owner/name>] [--score <score>]
+bun run dev -- <agent> --model <model> --eval <owner/name>
 ```
 
 ## Continuous Releases
````
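The three-episode behavior described in the README change can be sketched roughly as follows. This is an illustrative assumption, not the framework's actual API: `runEpisode`, `EpisodeResult`, and the averaging step are hypothetical names, and the real episode body (fresh clone, agent run, LLM judging) is stubbed out with a placeholder score.

```typescript
// Hypothetical sketch of a single agent/model/eval invocation running
// three isolated episodes in parallel and aggregating their scores.
const EPISODES = 3;

interface EpisodeResult {
  score: number; // aggregate judge score for one episode
}

async function runEpisode(episode: number): Promise<EpisodeResult> {
  // The real framework clones the target repo fresh, runs the agent,
  // and collects scores from the LLM judges; here we just log the
  // episode marker and return a placeholder score.
  console.log(`[episode ${episode}/${EPISODES}]`);
  return { score: episode };
}

async function runBenchmark(): Promise<number> {
  // The commit message notes the 3 runs are aggregated in parallel.
  const results = await Promise.all(
    Array.from({ length: EPISODES }, (_, i) => runEpisode(i + 1)),
  );
  // Aggregate by averaging the per-episode scores.
  return results.reduce((sum, r) => sum + r.score, 0) / EPISODES;
}
```

With the placeholder scores 1, 2, and 3 the aggregate is simply their mean; the real framework would substitute actual judge scores per episode.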

agents/opencode.ts

Lines changed: 4 additions & 0 deletions

```diff
@@ -110,8 +110,12 @@ function serializeError(error: unknown): Record<string, unknown> {
       name: error.name,
       message: error.message,
       stack: error.stack,
+      cause: error.cause ? serializeError(error.cause) : undefined,
     };
   }
+  if (typeof error === "object" && error !== null) {
+    return { ...error };
+  }
   return { value: String(error) };
 }
 
```

0 commit comments
