Skip to content

Commit 12f67dc

Browse files
committed
update: create prompts for each eval
1 parent a69a3fa commit 12f67dc

5 files changed

Lines changed: 300 additions & 87 deletions

File tree

cli.ts

Lines changed: 75 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -208,33 +208,54 @@ async function main(): Promise<void> {
208208

209209
const evalId = evalDefinition.repo;
210210

211-
let plannerTasks: PlannerTask[] = [];
211+
let evaluationPrompt: string;
212212

213-
try {
214-
console.log(`[${evalId} planner] Fetching commit diffs from GitHub...`);
215-
const commitDiffs = await fetchPlannerCommitDiffs(evalDefinition);
216-
217-
assert(
218-
commitDiffs.length > 0,
219-
`No commits found between ${evalDefinition.from} and ${evalDefinition.to} for ${evalDefinition.repo}.`,
213+
// Check if a pre-generated prompt exists
214+
if (evalDefinition.prompt) {
215+
console.log(`[${evalId}] Using pre-generated prompt from dataset.yaml`);
216+
evaluationPrompt = evalDefinition.prompt;
217+
} else {
218+
console.warn(
219+
`[${evalId}] WARNING: No pre-generated prompt found. Falling back to dynamic generation.`,
220+
);
221+
console.warn(
222+
`[${evalId}] Run 'bun run scripts/generate-prompts.ts --repo ${evalDefinition.repo}' to generate stable prompts.`,
220223
);
221224

222-
plannerTasks = await generatePlannerTasks(evalDefinition, commitDiffs);
225+
try {
226+
console.log(`[${evalId} planner] Fetching commit diffs from GitHub...`);
227+
const commitDiffs = await fetchPlannerCommitDiffs(evalDefinition);
223228

224-
assert(
225-
plannerTasks.length > 0,
226-
`Planner produced no tasks for ${evalDefinition.repo} (${evalDefinition.from}..${evalDefinition.to}).`,
227-
);
228-
} catch (error) {
229-
if (error instanceof Error) {
230-
console.error(
231-
`Failed to prepare evaluation ${evalId}: ${error.message}`,
229+
assert(
230+
commitDiffs.length > 0,
231+
`No commits found between ${evalDefinition.from} and ${evalDefinition.to} for ${evalDefinition.repo}.`,
232232
);
233-
} else {
234-
console.error("Failed to prepare evaluation", evalId);
233+
234+
const plannerTasks = await generatePlannerTasks(
235+
evalDefinition,
236+
commitDiffs,
237+
);
238+
239+
assert(
240+
plannerTasks.length > 0,
241+
`Planner produced no tasks for ${evalDefinition.repo} (${evalDefinition.from}..${evalDefinition.to}).`,
242+
);
243+
244+
// Combine all task prompts into a single prompt (fallback behavior)
245+
evaluationPrompt = plannerTasks
246+
.map((task, idx) => `${idx + 1}. ${task.prompt}`)
247+
.join("\n\n");
248+
} catch (error) {
249+
if (error instanceof Error) {
250+
console.error(
251+
`Failed to prepare evaluation ${evalId}: ${error.message}`,
252+
);
253+
} else {
254+
console.error("Failed to prepare evaluation", evalId);
255+
}
256+
process.exitCode = 1;
257+
assert(false, "evaluation preparation failed");
235258
}
236-
process.exitCode = 1;
237-
assert(false, "evaluation preparation failed");
238259
}
239260

240261
const executeCombination = async (): Promise<{
@@ -293,56 +314,43 @@ async function main(): Promise<void> {
293314
}
294315
}
295316

296-
let tasksExecuted = 0;
297-
298-
for (const task of plannerTasks) {
299-
const logPrefix = `${prefix} ${task.commit}`;
300-
301-
try {
302-
await withRetries(
303-
async () => {
304-
await agentRegistration.definition.run(
305-
model,
306-
task.prompt,
307-
cwd!,
308-
{
309-
onStart: (commandString: string) => {
310-
console.log(`${logPrefix} ${commandString.trim()}`);
311-
},
312-
logPrefix,
317+
// Run the agent once with the evaluation prompt
318+
try {
319+
await withRetries(
320+
async () => {
321+
await agentRegistration.definition.run(
322+
model,
323+
evaluationPrompt,
324+
cwd!,
325+
{
326+
onStart: (commandString: string) => {
327+
console.log(`${prefix} ${commandString.trim()}`);
313328
},
329+
logPrefix: prefix,
330+
},
331+
);
332+
},
333+
{
334+
retries: 3,
335+
onRetry(error, attempt, retries) {
336+
const baseMessage =
337+
error instanceof Error ? error.message : String(error);
338+
console.error(
339+
`${prefix} Failed to render command for ${model} (attempt ${attempt}/${retries}): ${baseMessage}`,
314340
);
315-
},
316-
{
317-
retries: 3,
318-
onRetry(error, attempt, retries) {
319-
const baseMessage =
320-
error instanceof Error ? error.message : String(error);
321-
console.error(
322-
`${logPrefix} Failed to render command for ${model} (attempt ${attempt}/${retries}): ${baseMessage}`,
323-
);
324341

325-
if (attempt < retries) {
326-
console.log(
327-
`${logPrefix} Retrying agent run (attempt ${attempt + 1}/${retries})...`,
328-
);
329-
}
330-
},
342+
if (attempt < retries) {
343+
console.log(
344+
`${prefix} Retrying agent run (attempt ${attempt + 1}/${retries})...`,
345+
);
346+
}
331347
},
332-
);
333-
} catch (error) {
334-
const message =
335-
error instanceof Error ? error.message : String(error);
336-
fail(
337-
`Agent run failed for planner task ${task.commit}: ${message}`,
338-
);
339-
}
340-
341-
tasksExecuted += 1;
342-
}
343-
344-
if (tasksExecuted === 0) {
345-
fail("No planner tasks have been executed.");
348+
},
349+
);
350+
} catch (error) {
351+
const message =
352+
error instanceof Error ? error.message : String(error);
353+
fail(`Agent run failed: ${message}`);
346354
}
347355

348356
const hasChanges = finalizeAgentChanges(

dataset.yaml

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
1-
- repo: prismicio-community/course-fizzi-next
2-
from: 15037446358508e153e765da49f8f5defa7fbbf6
3-
to: 2760114f2647ebec8f63e0ecc2dc87a8cd4096ac
1+
- repo: "prismicio-community/course-fizzi-next"
2+
from: "15037446358508e153e765da49f8f5defa7fbbf6"
3+
to: "2760114f2647ebec8f63e0ecc2dc87a8cd4096ac"
44
issues: []
55
scores:
66
api-signature:
@@ -15,13 +15,14 @@
1515
weight: 0.1
1616
args:
1717
setup:
18-
- npm ci
18+
- "npm ci"
1919
commands:
20-
- npm run lint
21-
- npm exec prettier -- --check .
22-
- repo: DataDog/datadog-lambda-python
23-
from: 93d4a07fa61a4d4d2feec08e722505a9e0cc8657
24-
to: d7763789f262b2da228f8210509e302e6e510d0a
20+
- "npm run lint"
21+
- "npm exec prettier -- --check ."
22+
prompt: "Update the project to provide a streamlined content setup experience for users who want to skip the tutorial. Replace the old migration script with a new automated content setup script that handles authentication, fetches documents from a source repository, and copies them to the user's repository. Update the README to include clear instructions for launching the site without going through the tutorial, including steps to clone the code, select the locale, run the setup script, publish the migration release, and configure the slice simulator URL. Change the package name from \"nextjs-starter-prismic-minimal\" to \"fizzi\" in package.json and package-lock.json. Add a new npm script called \"set-up-content\" that runs the setup script via npx tsx. The new setup script should automatically handle login if needed (by opening a browser session), fetch all documents from a source repository matching the package name, create a migration that copies those documents with human-readable release names derived from document UIDs or types, and print a URL to the migration release for publishing. Remove the old migrate.ts file and the documents directory entirely since content will now be pulled from the source repository dynamically. Ensure the script provides clear console output at each step to guide the user through the process."
23+
- repo: "DataDog/datadog-lambda-python"
24+
from: "93d4a07fa61a4d4d2feec08e722505a9e0cc8657"
25+
to: "d7763789f262b2da228f8210509e302e6e510d0a"
2526
issues: []
2627
scores:
2728
api-signature:
@@ -36,15 +37,15 @@
3637
weight: 0.1
3738
args:
3839
setup:
39-
- python3 -m venv .venv
40-
- ./.venv/bin/python -m pip install ".[dev]"
40+
- "python3 -m venv .venv"
41+
- "./.venv/bin/python -m pip install \".[dev]\""
4142
commands:
42-
- ./.venv/bin/pytest -vv
43-
- ./.venv/bin/flake8 datadog_lambda/
44-
45-
- repo: AlaminPu1007/algorithm-visualizer
46-
from: ca409519ec96a83ec8d6c2ba30f2487f8d601719
47-
to: 21845e972dd8e2378cbcd16accc5ae8cdd37acb2
43+
- "./.venv/bin/pytest -vv"
44+
- "./.venv/bin/flake8 datadog_lambda/"
45+
prompt: "Add a metric to track Lambda batch item failures. When Lambda functions return a response containing batch item failures (the batchItemFailures field), emit a count of how many items failed as an enhanced metric. This should only happen when enhanced metrics are enabled and the response structure is valid. The metric should be submitted automatically after each Lambda invocation by integrating it into the wrapper's post-execution hook. Follow existing codebase patterns for function signatures, metric submission, and integration points. Include comprehensive test coverage for various scenarios including responses with failures, empty failures, missing fields, non-dict responses, invalid field types, disabled enhanced metrics, and integration with the wrapper."
46+
- repo: "AlaminPu1007/algorithm-visualizer"
47+
from: "ca409519ec96a83ec8d6c2ba30f2487f8d601719"
48+
to: "21845e972dd8e2378cbcd16accc5ae8cdd37acb2"
4849
issues: []
4950
scores:
5051
api-signature:
@@ -59,7 +60,8 @@
5960
weight: 0.1
6061
args:
6162
setup:
62-
- npm ci
63+
- "npm ci"
6364
commands:
64-
- npm run lint
65-
- npm run build
65+
- "npm run lint"
66+
- "npm run build"
67+
prompt: "Ship version 1.6.0 with Floyd Warshall algorithm support and UI refinements. Update the version number in package.json to 1.6.0 and add a corresponding changelog entry documenting the Floyd Warshall algorithm implementation, loading state management during algorithm execution, and error handling for development mode. In the README, update the shortest path finding section to list Bellman-Ford, Floyd Warshall, and Dijkstra's algorithms (with backticks for formatting), and clean up extra blank lines between feature sections. Add Floyd Warshall to the projects data schema as a new entry in the appropriate algorithm list. Refactor the PathFind component layout: change the default button type from 'shortest-path' to 'unique-path', reorganize the control panel to right-align and adjust responsive breakpoints from 410px to 600px where appropriate, reorder the algorithm type dropdown options so Unique Path and No of island appear before Shortest Path, and make the container positioning relative with the ShortestPath status plate absolutely positioned. Update the ShortestPath, UniquePath, and NoOfIslands components to remove their individual container divs and replace them with fragments or empty tags, allowing the parent PathFind component to manage layout. Ensure all changes maintain existing functionality and follow the codebase's component structure patterns."

lib/dataset.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ const datasetSchema = z.array(
1616
.regex(/^[^/]+\/[^/]+$/, "repo must follow the format <owner>/<name>."),
1717
from: z.string().min(1, "from commit SHA is required."),
1818
to: z.string().min(1, "to commit SHA is required."),
19+
prompt: z.string().optional(),
1920
issues: z.array(z.number().int()),
2021
scores: z.record(scoreConfigSchema)
2122
})

lib/planner.ts

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -128,3 +128,41 @@ export async function generatePlannerTasks(
128128

129129
return tasks;
130130
}
131+
132+
/**
 * Zod schema for an object carrying a single non-empty `prompt` string.
 * It is passed as the `schema` option to `generateObject` in
 * `generateSinglePrompt`.
 */
const singlePromptSchema = z.object({
133+
prompt: z.string().min(1),
134+
});
135+
136+
/**
 * Asks the planner model for one prompt describing the whole change set
 * between `entry.from` and `entry.to`.
 *
 * @param entry - Dataset entry; its `repo`, `from` and `to` fields are
 *   interpolated into the model prompt and `repo` into the error message.
 * @param fullDiff - Diff text for the commit range. Only the first 50,000
 *   characters are sent; longer input gets a "[truncated]" marker appended.
 * @returns The model's `prompt` field after `sanitizePlannerPrompt`.
 * @throws Re-throws any failure from the request or sanitising step, with the
 *   message prefixed by "Planner failed to generate prompt for <repo>: ".
 */
export async function generateSinglePrompt(
137+
entry: DatasetEval,
138+
fullDiff: string,
139+
): Promise<string> {
140+
// Cap the diff at 50,000 characters and flag the cut in the text itself
// (presumably to bound request size; confirm against the model's limits).
const truncatedDiff =
141+
fullDiff.length > 50_000
142+
? `${fullDiff.slice(0, 50_000)}\n... [truncated]`
143+
: fullDiff;
144+
145+
try {
146+
const result = await generateObject({
147+
model: getZenLanguageModel(plannerModelId),
148+
schema: singlePromptSchema,
149+
system: buildSystemPrompt(),
150+
temperature: 0,
151+
prompt: `Repository: ${entry.repo}
152+
Base commit: ${entry.from}
153+
Target commit: ${entry.to}
154+
155+
Complete diff showing all changes:
156+
${truncatedDiff}
157+
158+
Generate a single comprehensive prompt that describes all changes needed to transform the codebase from the base commit to the target commit. Return the JSON object with the prompt.`,
159+
});
160+
161+
return sanitizePlannerPrompt(result.object.prompt);
162+
} catch (error) {
163+
// Non-Error throwables are wrapped so a message can always be set.
// NOTE: when `error` is already an Error, its message is mutated in place
// (the same object is rethrown), so other holders of it see the prefix too.
const formatted =
164+
error instanceof Error ? error : new Error(String(error));
165+
formatted.message = `Planner failed to generate prompt for ${entry.repo}: ${formatted.message}`;
166+
throw formatted;
167+
}
168+
}

0 commit comments

Comments
 (0)