Skip to content

Commit 58e8458

Browse files
committed
debug workflow dispatch for dax
1 parent 3bc37da commit 58e8458

8 files changed

Lines changed: 671 additions & 327 deletions

File tree

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
# Reusable workflow: runs one benchmark job per entry of the caller-supplied
# matrix, uploads each result as an artifact, then merges the artifacts and
# sends a Discord notification.
name: Reusable Benchmark Workflow

on:
  workflow_call:
    inputs:
      matrix:
        description: 'JSON matrix with agent/model/eval combinations'
        required: true
        type: string
      package_urls:
        description: 'Package URLs from publish step'
        required: true
        type: string

permissions:
  contents: read
  actions: read

jobs:
  benchmark:
    name: Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      actions: read
    environment: production
    strategy:
      # Let every combination finish even when one of them fails.
      fail-fast: false
      matrix: ${{ fromJSON(inputs.matrix) }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: '1.2.21'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: Install OpenCode CLI
        run: bun add -g opencode-ai@dev @openai/codex-sdk

      # Looks up this matrix job in the current run via the Jobs API so the
      # benchmark can link back to its own job page.
      - name: Determine benchmark job URL
        id: job_url
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          BENCHMARK_AGENT: ${{ matrix.agent }}
          BENCHMARK_MODEL: ${{ matrix.model }}
          BENCHMARK_EVAL: ${{ matrix.eval }}
        run: |
          set -euo pipefail
          job_name="Benchmark ${BENCHMARK_AGENT} / ${BENCHMARK_MODEL} / ${BENCHMARK_EVAL}"
          jobs_endpoint="https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100"

          # The API returns at most 100 jobs per page, so keep paging until
          # the job is found or a short (last) page is reached.
          job_info=""
          page=1
          while [ -z "${job_info}" ]; do
            job_json="$(curl -fsSL \
              -H "Authorization: token ${GITHUB_TOKEN}" \
              -H "Accept: application/vnd.github+json" \
              "${jobs_endpoint}&page=${page}")"

            # When this workflow is invoked through workflow_call, the API
            # reports the job as "<caller job> / <job name>", so accept either
            # the exact name or that suffixed form.
            job_info="$(printf '%s\n' "${job_json}" \
              | jq -r --arg name "$job_name" \
                'first(select(type=="object" and has("jobs")) | .jobs[] | select(.name == $name or (.name | endswith(" / " + $name))) | [.id, .html_url] | @tsv)')"

            page_size="$(printf '%s\n' "${job_json}" \
              | jq -r 'if type=="object" and has("jobs") then (.jobs | length) else 0 end')"
            if [ "${page_size}" -lt 100 ]; then
              break
            fi
            page=$((page + 1))
          done

          if [ -z "${job_info}" ] || [ "${job_info}" = "null" ]; then
            echo "Failed to determine job info for ${job_name}." >&2
            exit 1
          fi

          IFS=$'\t' read -r job_id job_url <<<"${job_info}"

          if [ -z "${job_id}" ] || [ "${job_id}" = "null" ]; then
            echo "Failed to determine job ID for ${job_name}." >&2
            exit 1
          fi

          if [ -z "${job_url}" ] || [ "${job_url}" = "null" ]; then
            echo "Failed to determine job URL for ${job_name}." >&2
            exit 1
          fi

          echo "Job ID: ${job_id}"
          echo "GITHUB_BENCHMARK_JOB_URL=${job_url}" >> "$GITHUB_ENV"
          echo "url=${job_url}" >> "$GITHUB_OUTPUT"

      # Runs the published preview package against the selected agent, model
      # and eval; retried up to three times on error.
      - name: Run openreval benchmark
        uses: nick-fields/retry@v2
        env:
          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
          CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_BENCHMARK_JOB_URL: ${{ steps.job_url.outputs.url }}
          URLS: ${{ inputs.package_urls }}
          BENCHMARK_EVAL: ${{ matrix.eval }}
          BENCHMARK_MODEL: ${{ matrix.model }}
          BENCHMARK_AGENT: ${{ matrix.agent }}
        with:
          max_attempts: 3
          timeout_minutes: 90
          retry_on: error
          command: |
            set -euo pipefail
            # Use the first whitespace-separated token of the first non-blank line.
            PACKAGE_URL="$(printf '%s\n' "${URLS}" | awk 'NF {print $1; exit}')"

            if [ -z "${BENCHMARK_EVAL}" ]; then
              echo "Matrix entry missing evaluation identifier." >&2
              exit 1
            fi

            if [ -z "${BENCHMARK_MODEL}" ]; then
              echo "Matrix entry missing model identifier." >&2
              exit 1
            fi

            if [ -z "${BENCHMARK_AGENT}" ]; then
              echo "Matrix entry missing agent identifier." >&2
              exit 1
            fi

            if [ -z "${PACKAGE_URL}" ]; then
              echo "No package URL found in publish outputs." >&2
              exit 1
            fi

            PACKAGE_SPEC="orvl@${PACKAGE_URL}"
            OUTPUT_FILE="benchmark.json"
            COMMAND="bunx \"${PACKAGE_SPEC}\" \"${BENCHMARK_AGENT}\" --eval \"${BENCHMARK_EVAL}\" --model \"${BENCHMARK_MODEL}\" --output \"${OUTPUT_FILE}\""

            echo "Executing: ${COMMAND}"
            if ! bunx "${PACKAGE_SPEC}" "${BENCHMARK_AGENT}" --eval "${BENCHMARK_EVAL}" --model "${BENCHMARK_MODEL}" --output "${OUTPUT_FILE}"; then
              echo "openreval benchmark failed, dumping OpenCode logs..." >&2
              # Best effort: the log directory may not exist.
              find /home/runner/.local/share/opencode/log -type f -print -exec cat {} + || true
              exit 1
            fi

      # Builds the artifact name from the matrix values, replacing "/" with "-".
      - name: Prepare artifact name
        id: artifact
        env:
          BENCHMARK_AGENT: ${{ matrix.agent }}
          BENCHMARK_MODEL: ${{ matrix.model }}
          BENCHMARK_EVAL: ${{ matrix.eval }}
        run: |
          set -euo pipefail
          agent="${BENCHMARK_AGENT//\//-}"
          model="${BENCHMARK_MODEL//\//-}"
          eval="${BENCHMARK_EVAL//\//-}"
          echo "name=benchmark-${agent}-${model}-${eval}" >> "$GITHUB_OUTPUT"

      - name: Upload benchmark artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact.outputs.name }}
          path: benchmark.json
          # Fail instead of warning when the benchmark produced no output, so
          # the notify job never merges an incomplete result set.
          if-no-files-found: error

  # Merges all benchmark artifacts and posts the result to Discord; runs only
  # when the benchmark job as a whole succeeded.
  notify:
    runs-on: ubuntu-latest
    needs: benchmark
    if: needs.benchmark.result == 'success'
    environment: production
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: '1.2.21'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: Download benchmark artifacts
        uses: actions/download-artifact@v4
        with:
          path: benchmarks

      - name: Merge benchmark exports
        run: bun run scripts/merge-benchmark-exports.ts benchmarks merged-benchmark.json

      - name: Send Discord notification
        env:
          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
        run: |
          set -euo pipefail
          if [ ! -f merged-benchmark.json ]; then
            echo "merged-benchmark.json not found; skipping Discord notification." >&2
            exit 0
          fi

          bun run scripts/discord-sample.ts merged-benchmark.json
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Manually dispatched workflow: publishes a preview package, builds a
# benchmark matrix from the selected model checkboxes, and hands both to the
# reusable benchmark workflow.
name: Compare Models

on:
  workflow_dispatch:
    inputs:
      # OpenCode agent models
      opencode_opencode_gpt_5_codex:
        description: 'opencode:opencode/gpt-5-codex'
        type: boolean
        default: false
      opencode_opencode_claude_sonnet_4_5:
        description: 'opencode:opencode/claude-sonnet-4-5'
        type: boolean
        default: false
      # Codex agent models
      codex_gpt_5_codex:
        description: 'codex:gpt-5-codex'
        type: boolean
        default: false
      codex_gpt_5:
        description: 'codex:gpt-5'
        type: boolean
        default: false
      # Claude Code agent models
      claude_code_claude_sonnet_4_5:
        description: 'claude-code:claude-sonnet-4-5'
        type: boolean
        default: false

permissions:
  contents: read
  actions: read

jobs:
  publish:
    runs-on: ubuntu-latest
    outputs:
      sha: ${{ steps.publish.outputs.sha }}
      urls: ${{ steps.publish.outputs.urls }}
      packages: ${{ steps.publish.outputs.packages }}
      matrix: ${{ steps.build-matrix.outputs.matrix }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: '1.2.21'

      # Same lockfile-enforcing install as the reusable benchmark workflow.
      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: Build
        run: bun run build

      - id: publish
        name: Publish preview with pkg.pr.new
        run: bunx pkg-pr-new publish --bun

      - id: build-matrix
        name: Build matrix from selected models
        env:
          WORKFLOW_INPUTS: ${{ toJSON(inputs) }}
        run: |
          set -euo pipefail

          # Pass all inputs to the script. The result goes through `jq -c` so
          # it is validated as JSON and collapsed to one line; a multi-line
          # value would corrupt the `name=value` entry in $GITHUB_OUTPUT.
          MATRIX_JSON="$(printf '%s\n' "$WORKFLOW_INPUTS" \
            | bun run scripts/build-workflow-matrix.ts \
            | jq -c .)"

          if [ -z "${MATRIX_JSON}" ]; then
            echo "scripts/build-workflow-matrix.ts produced no matrix JSON." >&2
            exit 1
          fi

          echo "matrix=${MATRIX_JSON}" >> "$GITHUB_OUTPUT"

  # Runs only when the publish job succeeded and reported package URLs.
  run-benchmarks:
    needs: publish
    if: needs.publish.result == 'success' && needs.publish.outputs.urls != ''
    uses: ./.github/workflows/benchmark-reusable.yml
    with:
      matrix: ${{ needs.publish.outputs.matrix }}
      package_urls: ${{ needs.publish.outputs.urls }}
    secrets: inherit

0 commit comments

Comments
 (0)