Skip to content

Commit 0303fcb

Browse files
Merge branch 'main' into tmd/add-log-summaries-and-token-usage
2 parents 94850d2 + 7a023b1 commit 0303fcb

11 files changed

Lines changed: 779 additions & 328 deletions

File tree

Lines changed: 197 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,197 @@
# Reusable workflow (workflow_call).
#
# Inputs:
#   matrix       - JSON string expanded into the benchmark job matrix; each
#                  entry is read as `agent`, `model` and `eval`.
#   package_urls - whitespace-separated package URLs; only the first token is
#                  used to build the `orvl@<url>` package spec.
#
# Jobs:
#   benchmark - one job per matrix entry; runs the benchmark through bunx and
#               uploads `benchmark.json` as an artifact.
#   notify    - runs only when the benchmark job result is `success`; downloads
#               the artifacts, merges them and runs the Discord script.
name: Reusable Benchmark Workflow

on:
  workflow_call:
    inputs:
      matrix:
        description: 'JSON matrix with agent/model/eval combinations'
        required: true
        type: string
      package_urls:
        description: 'Package URLs from publish step'
        required: true
        type: string

permissions:
  contents: read
  actions: read

jobs:
  benchmark:
    name: Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      actions: read
    environment: production
    strategy:
      # Let every agent/model/eval combination finish even if one fails.
      fail-fast: false
      matrix: ${{ fromJSON(inputs.matrix) }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          # Quoted so the version can never be read as a number.
          bun-version: '1.2.21'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: Install OpenCode CLI
        run: bun add -g opencode-ai@dev @openai/codex-sdk

      # Looks up this job's own ID and HTML URL through the REST API, then
      # exposes the URL both as an environment variable and as a step output.
      - name: Determine benchmark job URL
        id: job_url
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          BENCHMARK_AGENT: ${{ matrix.agent }}
          BENCHMARK_MODEL: ${{ matrix.model }}
          BENCHMARK_EVAL: ${{ matrix.eval }}
        run: |
          set -euo pipefail
          # When using reusable workflows, job names get prefixed, so we search for jobs containing our pattern
          job_pattern="Benchmark ${BENCHMARK_AGENT} / ${BENCHMARK_MODEL} / ${BENCHMARK_EVAL}"
          jobs_endpoint="https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100"

          # Jobs are collected one JSON object per line in a temp file so that
          # large matrices never have to fit on a command line.
          jobs_file="$(mktemp)"
          trap 'rm -f "${jobs_file}"' EXIT

          # The endpoint returns at most 100 jobs per page; keep fetching until
          # a short page shows up so matrices with more than 100 jobs work too.
          page=1
          while :; do
            page_json="$(curl -fsSL \
              -H "Authorization: token ${GITHUB_TOKEN}" \
              -H "Accept: application/vnd.github+json" \
              "${jobs_endpoint}&page=${page}")"
            page_count="$(printf '%s\n' "${page_json}" \
              | jq -r 'if type == "object" and has("jobs") then (.jobs | length) else 0 end')"
            printf '%s\n' "${page_json}" \
              | jq -c 'select(type == "object" and has("jobs")) | .jobs[]' >> "${jobs_file}"
            if [ "${page_count}" -lt 100 ]; then
              break
            fi
            page=$((page + 1))
          done

          # Try to find job by exact name match first, then by pattern match.
          # "Exact" also accepts the "<caller job> / <name>" prefixed form, so a
          # sibling whose name merely extends ours (eval "foo" vs "foo-bar") is
          # not picked by accident. Only in-progress jobs are considered.
          job_info="$(jq -r -s --arg pattern "$job_pattern" '
            map(select(.status == "in_progress")) as $running
            | ($running | map(select(.name == $pattern or (.name | endswith(" / " + $pattern))))) as $exact
            | (if ($exact | length) > 0
               then $exact
               else ($running | map(select(.name | contains($pattern))))
               end)
            | .[0] // empty
            | [.id, .html_url]
            | @tsv
          ' "${jobs_file}")"

          if [ -z "${job_info}" ] || [ "${job_info}" = "null" ]; then
            echo "Failed to determine job info for pattern: ${job_pattern}" >&2
            echo "Available jobs:" >&2
            jq -r '.name' "${jobs_file}" >&2 || true
            exit 1
          fi

          IFS=$'\t' read -r job_id job_url <<<"${job_info}"

          if [ -z "${job_id}" ] || [ "${job_id}" = "null" ]; then
            echo "Failed to determine job ID for pattern: ${job_pattern}" >&2
            exit 1
          fi

          if [ -z "${job_url}" ] || [ "${job_url}" = "null" ]; then
            echo "Failed to determine job URL for pattern: ${job_pattern}" >&2
            exit 1
          fi

          echo "Job ID: ${job_id}"
          echo "Job URL: ${job_url}"
          echo "GITHUB_BENCHMARK_JOB_URL=${job_url}" >> "$GITHUB_ENV"
          echo "url=${job_url}" >> "$GITHUB_OUTPUT"

      # Runs the benchmark up to three times (90 minute timeout per attempt)
      # and writes the result to benchmark.json.
      - name: Run openreval benchmark
        uses: nick-fields/retry@v2
        env:
          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
          CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_BENCHMARK_JOB_URL: ${{ steps.job_url.outputs.url }}
          URLS: ${{ inputs.package_urls }}
          BENCHMARK_EVAL: ${{ matrix.eval }}
          BENCHMARK_MODEL: ${{ matrix.model }}
          BENCHMARK_AGENT: ${{ matrix.agent }}
        with:
          max_attempts: 3
          timeout_minutes: 90
          retry_on: error
          command: |
            set -euo pipefail
            # First whitespace-separated token of the first non-blank line.
            PACKAGE_URL="$(printf '%s\n' "${URLS}" | awk 'NF {print $1; exit}')"

            if [ -z "${BENCHMARK_EVAL}" ]; then
              echo "Matrix entry missing evaluation identifier." >&2
              exit 1
            fi

            if [ -z "${BENCHMARK_MODEL}" ]; then
              echo "Matrix entry missing model identifier." >&2
              exit 1
            fi

            if [ -z "${BENCHMARK_AGENT}" ]; then
              echo "Matrix entry missing agent identifier." >&2
              exit 1
            fi

            if [ -z "${PACKAGE_URL}" ]; then
              echo "No package URL found in publish outputs." >&2
              exit 1
            fi

            PACKAGE_SPEC="orvl@${PACKAGE_URL}"
            OUTPUT_FILE="benchmark.json"
            # COMMAND is only used for the log line below; the real invocation
            # passes each argument separately so no re-parsing takes place.
            COMMAND="bunx \"${PACKAGE_SPEC}\" \"${BENCHMARK_AGENT}\" --eval \"${BENCHMARK_EVAL}\" --model \"${BENCHMARK_MODEL}\" --output \"${OUTPUT_FILE}\""

            echo "Executing: ${COMMAND}"
            if ! bunx "${PACKAGE_SPEC}" "${BENCHMARK_AGENT}" --eval "${BENCHMARK_EVAL}" --model "${BENCHMARK_MODEL}" --output "${OUTPUT_FILE}"; then
              echo "openreval benchmark failed, dumping OpenCode logs..." >&2
              find /home/runner/.local/share/opencode/log -type f -print -exec cat {} + || true
              exit 1
            fi

      # Builds "benchmark-<agent>-<model>-<eval>" for the artifact name.
      - name: Prepare artifact name
        id: artifact
        env:
          BENCHMARK_AGENT: ${{ matrix.agent }}
          BENCHMARK_MODEL: ${{ matrix.model }}
          BENCHMARK_EVAL: ${{ matrix.eval }}
        run: |
          set -euo pipefail
          # upload-artifact rejects names containing " : < > | * ? \ / or line
          # breaks, so each of those becomes "-". Identifiers that only contain
          # "/" produce exactly the same name as before.
          sanitize() {
            printf '%s' "$1" | tr '/\\:<>|*?"\r\n' '-'
          }
          agent="$(sanitize "${BENCHMARK_AGENT}")"
          model="$(sanitize "${BENCHMARK_MODEL}")"
          eval="$(sanitize "${BENCHMARK_EVAL}")"
          echo "name=benchmark-${agent}-${model}-${eval}" >> "$GITHUB_OUTPUT"

      - name: Upload benchmark artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact.outputs.name }}
          path: benchmark.json

  notify:
    runs-on: ubuntu-latest
    needs: benchmark
    # Skipped unless the benchmark job as a whole reports success.
    if: needs.benchmark.result == 'success'
    environment: production
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: '1.2.21'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      # No artifact name is given, so every artifact of the run is downloaded
      # into its own sub-directory of ./benchmarks.
      - name: Download benchmark artifacts
        uses: actions/download-artifact@v4
        with:
          path: benchmarks

      - name: Merge benchmark exports
        run: bun run scripts/merge-benchmark-exports.ts benchmarks merged-benchmark.json

      - name: Send Discord notification
        env:
          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
        run: |
          set -euo pipefail
          # A missing merge result is treated as "nothing to report", not as a
          # failure.
          if [ ! -f merged-benchmark.json ]; then
            echo "merged-benchmark.json not found; skipping Discord notification." >&2
            exit 0
          fi

          bun run scripts/discord-sample.ts merged-benchmark.json
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
# Manually triggered workflow (workflow_dispatch).
#
# Each boolean input selects one agent:model combination. The `publish` job
# builds the project, publishes a preview through pkg-pr-new and turns the
# selected inputs into a JSON matrix; `run-benchmarks` passes the matrix and
# the package URLs to the reusable benchmark workflow.
name: Compare Models
on:
  workflow_dispatch:
    inputs:
      codex_gpt_5_codex:
        # Descriptions are quoted because they contain ":".
        description: 'codex:gpt-5-codex'
        type: boolean
        default: false
      opencode_opencode_gpt_5_codex:
        description: 'opencode:opencode/gpt-5-codex'
        type: boolean
        default: false
      opencode_opencode_claude_sonnet_4_5:
        description: 'opencode:opencode/claude-sonnet-4-5'
        type: boolean
        default: false
      claude_code_claude_sonnet_4_5:
        description: 'claude-code:claude-sonnet-4-5'
        type: boolean
        default: false
permissions:
  contents: read
  actions: read
jobs:
  publish:
    runs-on: ubuntu-latest
    outputs:
      # NOTE(review): sha/urls/packages are presumably written to the step
      # outputs by pkg-pr-new itself; nothing in this file sets them. Confirm.
      sha: ${{ steps.publish.outputs.sha }}
      urls: ${{ steps.publish.outputs.urls }}
      packages: ${{ steps.publish.outputs.packages }}
      matrix: ${{ steps.build-matrix.outputs.matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          # Quoted so the version can never be read as a number.
          bun-version: '1.2.21'
      - name: Install dependencies
        # Same flag as the installs in the reusable benchmark workflow, so a
        # stale lockfile fails here instead of after the preview is published.
        run: bun install --frozen-lockfile
      - name: Build
        run: bun run build
      - id: publish
        name: Publish preview with pkg.pr.new
        run: bunx pkg-pr-new publish --bun
      - id: build-matrix
        name: Build matrix from selected models
        env:
          WORKFLOW_INPUTS: ${{ toJSON(inputs) }}
        run: |
          set -euo pipefail

          # Pass all inputs to the script. `jq -c` validates the script output
          # as JSON and collapses it to a single line, which the plain
          # `key=value` output format below requires.
          MATRIX_JSON="$(printf '%s\n' "$WORKFLOW_INPUTS" \
            | bun run scripts/build-workflow-matrix.ts \
            | jq -c .)"

          if [ -z "${MATRIX_JSON}" ]; then
            echo "scripts/build-workflow-matrix.ts produced no matrix JSON." >&2
            exit 1
          fi

          echo "matrix=${MATRIX_JSON}" >> "$GITHUB_OUTPUT"
  run-benchmarks:
    needs: publish
    # Requires a successful publish that reported at least one package URL.
    if: needs.publish.result == 'success' && needs.publish.outputs.urls != ''
    uses: ./.github/workflows/benchmark-reusable.yml
    with:
      matrix: ${{ needs.publish.outputs.matrix }}
      package_urls: ${{ needs.publish.outputs.urls }}
    secrets: inherit

0 commit comments

Comments
 (0)