anomalyco
diff --git a/‎.github/workflows/benchmark-reusable.yml‎
Lines changed: 197 additions & 0 deletions b/‎.github/workflows/benchmark-reusable.yml‎
Lines changed: 197 additions & 0 deletions
diff --git a/‎.github/workflows/compare-models.yml‎
Lines changed: 63 additions & 0 deletions b/‎.github/workflows/compare-models.yml‎
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,197 @@
+# Reusable workflow: runs one benchmark job per matrix entry, then merges the
+# per-job results and posts a Discord notification.
+name: Reusable Benchmark Workflow
+
+on:
+  workflow_call:
+    inputs:
+      # JSON string; parsed with fromJSON() to form the benchmark job's
+      # strategy matrix (entries are read as matrix.agent/model/eval).
+      matrix:
+        description: 'JSON matrix with agent/model/eval combinations'
+        required: true
+        type: string
+      # Only the first whitespace-separated token of the first non-blank line
+      # is used as the package URL by the benchmark step.
+      package_urls:
+        description: 'Package URLs from publish step'
+        required: true
+        type: string
+
+# `actions: read` is used by the job-URL lookup, which lists this run's jobs
+# through the REST API.
+permissions:
+  contents: read
+  actions: read
+
+jobs:
+  # One job per agent/model/eval combination taken from `inputs.matrix`.
+  benchmark:
+    # NOTE: the "Determine benchmark job URL" step rebuilds this exact string
+    # to find its own job through the API; keep the two in sync.
+    name: Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      actions: read
+    environment: production
+    strategy:
+      # A failing combination must not cancel the remaining combinations.
+      fail-fast: false
+      matrix: ${{ fromJSON(inputs.matrix) }}
+
+    steps:
+      # Toolchain setup: repository checkout, pinned Bun, locked dependencies.
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      # Globally installs the `dev` dist-tag of opencode-ai plus the Codex SDK.
+      # NOTE(review): `@dev` is a moving tag, so runs are not reproducible
+      # across time - confirm this is intended.
+      - name: Install OpenCode CLI
+        run: bun add -g opencode-ai@dev @openai/codex-sdk
+
+      # Looks up this job's own id and HTML URL through the REST API so the
+      # benchmark can link back to it. Exposes the URL as the step output `url`
+      # and as the GITHUB_BENCHMARK_JOB_URL environment variable.
+      - name: Determine benchmark job URL
+        id: job_url
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          BENCHMARK_AGENT: ${{ matrix.agent }}
+          BENCHMARK_MODEL: ${{ matrix.model }}
+          BENCHMARK_EVAL: ${{ matrix.eval }}
+        run: |
+          set -euo pipefail
+          # Must mirror the job's `name:` template. When this workflow is called
+          # as a reusable workflow the API reports the name prefixed with the
+          # caller's job name ("<caller job> / Benchmark ...").
+          job_pattern="Benchmark ${BENCHMARK_AGENT} / ${BENCHMARK_MODEL} / ${BENCHMARK_EVAL}"
+          # NOTE: a single request; only the first 100 jobs of the run are inspected.
+          jobs_endpoint="https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100"
+          job_json="$(curl -fsSL \
+            -H "Authorization: token ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github+json" \
+            "${jobs_endpoint}")"
+
+          # Accept an exact name match or a name ending in " / <pattern>".
+          # A plain substring match is not enough: the pattern for eval "foo"
+          # would also match the in-progress job for eval "foo-bar" and could
+          # return the wrong job URL.
+          job_info="$(printf '%s\n' "${job_json}" \
+            | jq -r --arg pattern "$job_pattern" 'select(type=="object" and has("jobs")) | .jobs[] | select((.name == $pattern) or (.name | endswith(" / " + $pattern))) | select(.status == "in_progress") | [.id, .html_url] | @tsv' \
+            | head -n 1)"
+
+          if [ -z "${job_info}" ] || [ "${job_info}" = "null" ]; then
+            echo "Failed to determine job info for pattern: ${job_pattern}" >&2
+            echo "Available jobs:" >&2
+            printf '%s\n' "${job_json}" | jq -r '.jobs[]?.name' >&2 || true
+            exit 1
+          fi
+
+          # job_info is "<id>\t<html_url>" as produced by jq's @tsv.
+          IFS=$'\t' read -r job_id job_url <<<"${job_info}"
+
+          if [ -z "${job_id}" ] || [ "${job_id}" = "null" ]; then
+            echo "Failed to determine job ID for pattern: ${job_pattern}" >&2
+            exit 1
+          fi
+
+          if [ -z "${job_url}" ] || [ "${job_url}" = "null" ]; then
+            echo "Failed to determine job URL for pattern: ${job_pattern}" >&2
+            exit 1
+          fi
+
+          echo "Job ID: ${job_id}"
+          echo "Job URL: ${job_url}"
+          echo "GITHUB_BENCHMARK_JOB_URL=${job_url}" >> "$GITHUB_ENV"
+          echo "url=${job_url}" >> "$GITHUB_OUTPUT"
+
+      # Runs the benchmark CLI for this matrix entry through nick-fields/retry
+      # (see `with:` for the attempt and timeout settings) and writes the
+      # result to benchmark.json in the workspace. On failure the OpenCode log
+      # files are dumped to the job log before the attempt exits non-zero.
+      - name: Run openreval benchmark
+        uses: nick-fields/retry@v2
+        env:
+          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
+          CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Also exported through GITHUB_ENV by the job-URL step; set here
+          # explicitly from that step's `url` output.
+          GITHUB_BENCHMARK_JOB_URL: ${{ steps.job_url.outputs.url }}
+          URLS: ${{ inputs.package_urls }}
+          BENCHMARK_EVAL: ${{ matrix.eval }}
+          BENCHMARK_MODEL: ${{ matrix.model }}
+          BENCHMARK_AGENT: ${{ matrix.agent }}
+        with:
+          max_attempts: 3
+          timeout_minutes: 90
+          retry_on: error
+          # The script takes the first whitespace-separated token of the first
+          # non-blank line of URLS as the package URL, rejects empty
+          # eval/model/agent/URL values, then runs the package with bunx.
+          # COMMAND is built only so the invocation can be echoed to the log.
+          command: |
+            set -euo pipefail
+            PACKAGE_URL="$(printf '%s\n' "${URLS}" | awk 'NF {print $1; exit}')"
+
+            if [ -z "${BENCHMARK_EVAL}" ]; then
+              echo "Matrix entry missing evaluation identifier." >&2
+              exit 1
+            fi
+
+            if [ -z "${BENCHMARK_MODEL}" ]; then
+              echo "Matrix entry missing model identifier." >&2
+              exit 1
+            fi
+
+            if [ -z "${BENCHMARK_AGENT}" ]; then
+              echo "Matrix entry missing agent identifier." >&2
+              exit 1
+            fi
+
+            if [ -z "${PACKAGE_URL}" ]; then
+              echo "No package URL found in publish outputs." >&2
+              exit 1
+            fi
+
+            PACKAGE_SPEC="orvl@${PACKAGE_URL}"
+            OUTPUT_FILE="benchmark.json"
+            COMMAND="bunx \"${PACKAGE_SPEC}\" \"${BENCHMARK_AGENT}\" --eval \"${BENCHMARK_EVAL}\" --model \"${BENCHMARK_MODEL}\" --output \"${OUTPUT_FILE}\""
+
+            echo "Executing: ${COMMAND}"
+            if ! bunx "${PACKAGE_SPEC}" "${BENCHMARK_AGENT}" --eval "${BENCHMARK_EVAL}" --model "${BENCHMARK_MODEL}" --output "${OUTPUT_FILE}"; then
+              echo "openreval benchmark failed, dumping OpenCode logs..." >&2
+              find /home/runner/.local/share/opencode/log -type f -print -exec cat {} + || true
+              exit 1
+            fi
+
+      # Builds the artifact name `benchmark-<agent>-<model>-<eval>` and exposes
+      # it as the step output `name`.
+      - name: Prepare artifact name
+        id: artifact
+        env:
+          BENCHMARK_AGENT: ${{ matrix.agent }}
+          BENCHMARK_MODEL: ${{ matrix.model }}
+          BENCHMARK_EVAL: ${{ matrix.eval }}
+        run: |
+          set -euo pipefail
+          # upload-artifact rejects names containing any of  " : < > | * ? \ /
+          # or CR/LF. Replace each of them with "-" so identifiers such as
+          # "provider:model" do not fail the upload. Values that only contain
+          # "/" yield the same name as before.
+          sanitize() {
+            printf '%s' "$1" | tr '":<>|*?\\/\r\n' '-'
+          }
+          agent="$(sanitize "${BENCHMARK_AGENT}")"
+          model="$(sanitize "${BENCHMARK_MODEL}")"
+          eval="$(sanitize "${BENCHMARK_EVAL}")"
+          echo "name=benchmark-${agent}-${model}-${eval}" >> "$GITHUB_OUTPUT"
+
+      # Uploads this combination's benchmark.json under the name computed by
+      # the `artifact` step; the notify job downloads these artifacts.
+      - name: Upload benchmark artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ steps.artifact.outputs.name }}
+          path: benchmark.json
+
+  # Runs after the benchmark matrix: merges the per-job exports and posts the
+  # summary to Discord.
+  notify:
+    runs-on: ubuntu-latest
+    needs: benchmark
+    # Gated on the benchmark job's overall result being 'success'.
+    if: needs.benchmark.result == 'success'
+    environment: production
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      # Downloads the per-combination artifacts into benchmarks/<artifact name>/.
+      # The pattern restricts the download to artifacts produced by the
+      # benchmark job (all named `benchmark-...`), so unrelated artifacts
+      # uploaded elsewhere in the calling run are not passed to the merge script.
+      - name: Download benchmark artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: benchmarks
+          pattern: 'benchmark-*'
+
+      # Invokes the repository's merge script with the download directory and
+      # the output file name; the next step reads merged-benchmark.json.
+      - name: Merge benchmark exports
+        run: bun run scripts/merge-benchmark-exports.ts benchmarks merged-benchmark.json
+
+      # Posts the merged results to Discord. A missing merged file is treated
+      # as "nothing to report": the step logs a message and still succeeds.
+      - name: Send Discord notification
+        env:
+          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
+        run: |
+          set -euo pipefail
+          merged="merged-benchmark.json"
+
+          if [ -f "${merged}" ]; then
+            bun run scripts/discord-sample.ts "${merged}"
+          else
+            echo "merged-benchmark.json not found; skipping Discord notification." >&2
+          fi
@@ -0,0 +1,63 @@
+# Manually triggered workflow: publishes a preview package, builds a benchmark
+# matrix from the ticked inputs, and hands both to the reusable benchmark
+# workflow.
+name: Compare Models
+on:
+  workflow_dispatch:
+    # One boolean per agent:model pair (named in each description). The whole
+    # input set is passed as JSON to scripts/build-workflow-matrix.ts.
+    inputs:
+      codex_gpt_5_codex:
+        description: codex:gpt-5-codex
+        type: boolean
+        default: false
+      opencode_opencode_gpt_5_codex:
+        description: opencode:opencode/gpt-5-codex
+        type: boolean
+        default: false
+      opencode_opencode_claude_sonnet_4_5:
+        description: opencode:opencode/claude-sonnet-4-5
+        type: boolean
+        default: false
+      claude_code_claude_sonnet_4_5:
+        description: claude-code:claude-sonnet-4-5
+        type: boolean
+        default: false
+permissions:
+  contents: read
+  actions: read
+jobs:
+  # Builds the package, publishes a preview, and computes the benchmark matrix.
+  publish:
+    runs-on: ubuntu-latest
+    # sha/urls/packages come from the `publish` step and `matrix` from the
+    # `build-matrix` step; run-benchmarks consumes `urls` and `matrix`.
+    outputs:
+      sha: ${{ steps.publish.outputs.sha }}
+      urls: ${{ steps.publish.outputs.urls }}
+      packages: ${{ steps.publish.outputs.packages }}
+      matrix: ${{ steps.build-matrix.outputs.matrix }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+      # Install exactly what the lockfile pins, matching the install steps of
+      # the benchmark workflow, so the published preview is built from
+      # reproducible dependencies and lockfile drift fails the job.
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+      - name: Build
+        run: bun run build
+      # The job-level outputs sha/urls/packages read from this step's outputs.
+      # NOTE(review): presumably pkg-pr-new writes them to GITHUB_OUTPUT itself,
+      # since nothing in this step sets them explicitly - confirm.
+      - id: publish
+        name: Publish preview with pkg.pr.new
+        run: bunx pkg-pr-new publish --bun
+      # Feeds the workflow_dispatch inputs (as JSON on stdin) to the matrix
+      # script and exposes its JSON result as the step output `matrix`.
+      - id: build-matrix
+        name: Build matrix from selected models
+        env:
+          WORKFLOW_INPUTS: ${{ toJSON(inputs) }}
+        run: |
+          set -euo pipefail
+
+          # Pass all inputs to the script. `jq -c .` validates the result and
+          # collapses it onto one line: a `name=value` entry in GITHUB_OUTPUT
+          # must be single-line, so pretty-printed JSON would corrupt the output.
+          MATRIX_JSON="$(printf '%s\n' "$WORKFLOW_INPUTS" | bun run scripts/build-workflow-matrix.ts | jq -c .)"
+
+          if [ -z "${MATRIX_JSON}" ]; then
+            echo "build-workflow-matrix.ts produced no matrix JSON." >&2
+            exit 1
+          fi
+
+          echo "matrix=${MATRIX_JSON}" >> "$GITHUB_OUTPUT"
+  # Calls the reusable benchmark workflow with the matrix and package URLs
+  # produced by the publish job. Skipped unless publish succeeded and reported
+  # at least one URL.
+  run-benchmarks:
+    needs: publish
+    if: needs.publish.result == 'success' && needs.publish.outputs.urls != ''
+    uses: ./.github/workflows/benchmark-reusable.yml
+    with:
+      matrix: ${{ needs.publish.outputs.matrix }}
+      package_urls: ${{ needs.publish.outputs.urls }}
+    # The called workflow reads API keys and the Discord webhook from secrets.
+    secrets: inherit