update: create prompts for each eval

tmickleydoyle · tmickleydoyle · commit 12f67dc84939 · 2025-10-30T07:54:57.000-04:00
diff --git a/cli.ts b/cli.ts
@@ -208,33 +208,54 @@ async function main(): Promise<void> {
 
     const evalId = evalDefinition.repo;
 
-    let plannerTasks: PlannerTask[] = [];
+    let evaluationPrompt: string;
 
-    try {
-      console.log(`[${evalId} planner] Fetching commit diffs from GitHub...`);
-      const commitDiffs = await fetchPlannerCommitDiffs(evalDefinition);
-
-      assert(
-        commitDiffs.length > 0,
-        `No commits found between ${evalDefinition.from} and ${evalDefinition.to} for ${evalDefinition.repo}.`,
+    // Check if a pre-generated prompt exists
+    if (evalDefinition.prompt) {
+      console.log(`[${evalId}] Using pre-generated prompt from dataset.yaml`);
+      evaluationPrompt = evalDefinition.prompt;
+    } else {
+      console.warn(
+        `[${evalId}] WARNING: No pre-generated prompt found. Falling back to dynamic generation.`,
+      );
+      console.warn(
+        `[${evalId}] Run 'bun run scripts/generate-prompts.ts --repo ${evalDefinition.repo}' to generate stable prompts.`,
       );
 
-      plannerTasks = await generatePlannerTasks(evalDefinition, commitDiffs);
+      try {
+        console.log(`[${evalId} planner] Fetching commit diffs from GitHub...`);
+        const commitDiffs = await fetchPlannerCommitDiffs(evalDefinition);
 
-      assert(
-        plannerTasks.length > 0,
-        `Planner produced no tasks for ${evalDefinition.repo} (${evalDefinition.from}..${evalDefinition.to}).`,
-      );
-    } catch (error) {
-      if (error instanceof Error) {
-        console.error(
-          `Failed to prepare evaluation ${evalId}: ${error.message}`,
+        assert(
+          commitDiffs.length > 0,
+          `No commits found between ${evalDefinition.from} and ${evalDefinition.to} for ${evalDefinition.repo}.`,
         );
-      } else {
-        console.error("Failed to prepare evaluation", evalId);
+
+        const plannerTasks = await generatePlannerTasks(
+          evalDefinition,
+          commitDiffs,
+        );
+
+        assert(
+          plannerTasks.length > 0,
+          `Planner produced no tasks for ${evalDefinition.repo} (${evalDefinition.from}..${evalDefinition.to}).`,
+        );
+
+        // Combine all task prompts into a single prompt (fallback behavior)
+        evaluationPrompt = plannerTasks
+          .map((task, idx) => `${idx + 1}. ${task.prompt}`)
+          .join("\n\n");
+      } catch (error) {
+        if (error instanceof Error) {
+          console.error(
+            `Failed to prepare evaluation ${evalId}: ${error.message}`,
+          );
+        } else {
+          console.error("Failed to prepare evaluation", evalId);
+        }
+        process.exitCode = 1;
+        assert(false, "evaluation preparation failed");
       }
-      process.exitCode = 1;
-      assert(false, "evaluation preparation failed");
     }
 
     const executeCombination = async (): Promise<{
@@ -293,56 +314,43 @@ async function main(): Promise<void> {
             }
           }
 
-          let tasksExecuted = 0;
-
-          for (const task of plannerTasks) {
-            const logPrefix = `${prefix} ${task.commit}`;
-
-            try {
-              await withRetries(
-                async () => {
-                  await agentRegistration.definition.run(
-                    model,
-                    task.prompt,
-                    cwd!,
-                    {
-                      onStart: (commandString: string) => {
-                        console.log(`${logPrefix} ${commandString.trim()}`);
-                      },
-                      logPrefix,
+          // Run the agent once with the evaluation prompt
+          try {
+            await withRetries(
+              async () => {
+                await agentRegistration.definition.run(
+                  model,
+                  evaluationPrompt,
+                  cwd!,
+                  {
+                    onStart: (commandString: string) => {
+                      console.log(`${prefix} ${commandString.trim()}`);
                     },
+                    logPrefix: prefix,
+                  },
+                );
+              },
+              {
+                retries: 3,
+                onRetry(error, attempt, retries) {
+                  const baseMessage =
+                    error instanceof Error ? error.message : String(error);
+                  console.error(
+                    `${prefix} Failed to render command for ${model} (attempt ${attempt}/${retries}): ${baseMessage}`,
                   );
-                },
-                {
-                  retries: 3,
-                  onRetry(error, attempt, retries) {
-                    const baseMessage =
-                      error instanceof Error ? error.message : String(error);
-                    console.error(
-                      `${logPrefix} Failed to render command for ${model} (attempt ${attempt}/${retries}): ${baseMessage}`,
-                    );
 
-                    if (attempt < retries) {
-                      console.log(
-                        `${logPrefix} Retrying agent run (attempt ${attempt + 1}/${retries})...`,
-                      );
-                    }
-                  },
+                  if (attempt < retries) {
+                    console.log(
+                      `${prefix} Retrying agent run (attempt ${attempt + 1}/${retries})...`,
+                    );
+                  }
                 },
-              );
-            } catch (error) {
-              const message =
-                error instanceof Error ? error.message : String(error);
-              fail(
-                `Agent run failed for planner task ${task.commit}: ${message}`,
-              );
-            }
-
-            tasksExecuted += 1;
-          }
-
-          if (tasksExecuted === 0) {
-            fail("No planner tasks have been executed.");
+              },
+            );
+          } catch (error) {
+            const message =
+              error instanceof Error ? error.message : String(error);
+            fail(`Agent run failed: ${message}`);
           }
 
           const hasChanges = finalizeAgentChanges(
diff --git a/dataset.yaml b/dataset.yaml
@@ -1,6 +1,6 @@
-- repo: prismicio-community/course-fizzi-next
-  from: 15037446358508e153e765da49f8f5defa7fbbf6
-  to: 2760114f2647ebec8f63e0ecc2dc87a8cd4096ac
+- repo: "prismicio-community/course-fizzi-next"
+  from: "15037446358508e153e765da49f8f5defa7fbbf6"
+  to: "2760114f2647ebec8f63e0ecc2dc87a8cd4096ac"
   issues: []
   scores:
     api-signature:
@@ -15,13 +15,14 @@
       weight: 0.1
       args:
         setup:
-          - npm ci
+          - "npm ci"
         commands:
-          - npm run lint
-          - npm exec prettier -- --check .
-- repo: DataDog/datadog-lambda-python
-  from: 93d4a07fa61a4d4d2feec08e722505a9e0cc8657
-  to: d7763789f262b2da228f8210509e302e6e510d0a
+          - "npm run lint"
+          - "npm exec prettier -- --check ."
+  prompt: "Update the project to provide a streamlined content setup experience for users who want to skip the tutorial. Replace the old migration script with a new automated content setup script that handles authentication, fetches documents from a source repository, and copies them to the user's repository. Update the README to include clear instructions for launching the site without going through the tutorial, including steps to clone the code, select the locale, run the setup script, publish the migration release, and configure the slice simulator URL. Change the package name from \"nextjs-starter-prismic-minimal\" to \"fizzi\" in package.json and package-lock.json. Add a new npm script called \"set-up-content\" that runs the setup script via npx tsx. The new setup script should automatically handle login if needed (by opening a browser session), fetch all documents from a source repository matching the package name, create a migration that copies those documents with human-readable release names derived from document UIDs or types, and print a URL to the migration release for publishing. Remove the old migrate.ts file and the documents directory entirely since content will now be pulled from the source repository dynamically. Ensure the script provides clear console output at each step to guide the user through the process."
+- repo: "DataDog/datadog-lambda-python"
+  from: "93d4a07fa61a4d4d2feec08e722505a9e0cc8657"
+  to: "d7763789f262b2da228f8210509e302e6e510d0a"
   issues: []
   scores:
     api-signature:
@@ -36,15 +37,15 @@
       weight: 0.1
       args:
         setup:
-          - python3 -m venv .venv
-          - ./.venv/bin/python -m pip install ".[dev]"
+          - "python3 -m venv .venv"
+          - "./.venv/bin/python -m pip install \".[dev]\""
         commands:
-          - ./.venv/bin/pytest -vv
-          - ./.venv/bin/flake8 datadog_lambda/
-
-- repo: AlaminPu1007/algorithm-visualizer
-  from: ca409519ec96a83ec8d6c2ba30f2487f8d601719
-  to: 21845e972dd8e2378cbcd16accc5ae8cdd37acb2
+          - "./.venv/bin/pytest -vv"
+          - "./.venv/bin/flake8 datadog_lambda/"
+  prompt: "Add a metric to track Lambda batch item failures. When Lambda functions return a response containing batch item failures (the batchItemFailures field), emit a count of how many items failed as an enhanced metric. This should only happen when enhanced metrics are enabled and the response structure is valid. The metric should be submitted automatically after each Lambda invocation by integrating it into the wrapper's post-execution hook. Follow existing codebase patterns for function signatures, metric submission, and integration points. Include comprehensive test coverage for various scenarios including responses with failures, empty failures, missing fields, non-dict responses, invalid field types, disabled enhanced metrics, and integration with the wrapper."
+- repo: "AlaminPu1007/algorithm-visualizer"
+  from: "ca409519ec96a83ec8d6c2ba30f2487f8d601719"
+  to: "21845e972dd8e2378cbcd16accc5ae8cdd37acb2"
   issues: []
   scores:
     api-signature:
@@ -59,7 +60,8 @@
       weight: 0.1
       args:
         setup:
-          - npm ci
+          - "npm ci"
         commands:
-          - npm run lint
-          - npm run build
+          - "npm run lint"
+          - "npm run build"
+  prompt: "Ship version 1.6.0 with Floyd Warshall algorithm support and UI refinements. Update the version number in package.json to 1.6.0 and add a corresponding changelog entry documenting the Floyd Warshall algorithm implementation, loading state management during algorithm execution, and error handling for development mode. In the README, update the shortest path finding section to list Bellman-Ford, Floyd Warshall, and Dijkstra's algorithms (with backticks for formatting), and clean up extra blank lines between feature sections. Add Floyd Warshall to the projects data schema as a new entry in the appropriate algorithm list. Refactor the PathFind component layout: change the default button type from 'shortest-path' to 'unique-path', reorganize the control panel to right-align and adjust responsive breakpoints from 410px to 600px where appropriate, reorder the algorithm type dropdown options so Unique Path and No of island appear before Shortest Path, and make the container positioning relative with the ShortestPath status plate absolutely positioned. Update the ShortestPath, UniquePath, and NoOfIslands components to remove their individual container divs and replace them with fragments or empty tags, allowing the parent PathFind component to manage layout. Ensure all changes maintain existing functionality and follow the codebase's component structure patterns."
diff --git a/lib/dataset.ts b/lib/dataset.ts
@@ -16,6 +16,7 @@ const datasetSchema = z.array(
       .regex(/^[^/]+\/[^/]+$/, "repo must follow the format <owner>/<name>."),
     from: z.string().min(1, "from commit SHA is required."),
     to: z.string().min(1, "to commit SHA is required."),
+    prompt: z.string().optional(),
     issues: z.array(z.number().int()),
     scores: z.record(scoreConfigSchema)
   })
diff --git a/lib/planner.ts b/lib/planner.ts
@@ -128,3 +128,41 @@ export async function generatePlannerTasks(
 
   return tasks;
 }
+
+const singlePromptSchema = z.object({ // Object shape handed to generateObject in generateSinglePrompt below.
+  prompt: z.string().min(1), // Must be a non-empty string.
+});
+
+export async function generateSinglePrompt( // Asks the planner model for ONE prompt describing the whole from..to change set.
+  entry: DatasetEval, // Its repo, from and to fields are embedded in the model prompt; repo also appears in the error message.
+  fullDiff: string, // Diff text to describe; only the first 50,000 characters are forwarded.
+): Promise<string> { // Resolves to the model's prompt after it has passed through sanitizePlannerPrompt.
+  const truncatedDiff =
+    fullDiff.length > 50_000 // Cap on the diff text embedded in the model prompt (measured with String.length).
+      ? `${fullDiff.slice(0, 50_000)}\n... [truncated]` // Keep the head of the diff and append a visible truncation marker.
+      : fullDiff;
+
+  try {
+    const result = await generateObject({ // The generated value is read back below as result.object.prompt.
+      model: getZenLanguageModel(plannerModelId),
+      schema: singlePromptSchema,
+      system: buildSystemPrompt(),
+      temperature: 0, // NOTE(review): presumably chosen to keep generated prompts stable between runs — confirm.
+      prompt: `Repository: ${entry.repo}
+Base commit: ${entry.from}
+Target commit: ${entry.to}
+
+Complete diff showing all changes:
+${truncatedDiff}
+
+Generate a single comprehensive prompt that describes all changes needed to transform the codebase from the base commit to the target commit. Return the JSON object with the prompt.`,
+    });
+
+    return sanitizePlannerPrompt(result.object.prompt);
+  } catch (error) { // Every failure inside the try is rethrown with the repo name prefixed to its message.
+    const formatted =
+      error instanceof Error ? error : new Error(String(error)); // Non-Error throwables are wrapped so a message can be set.
+    formatted.message = `Planner failed to generate prompt for ${entry.repo}: ${formatted.message}`; // Mutates the caught Error in place; no new Error is created for Error instances.
+    throw formatted;
+  }
+}
diff --git a/scripts/generate-prompts.ts b/scripts/generate-prompts.ts