Skip to content

Commit 35b247a

Browse files
committed
update
1 parent ac6b7c5 commit 35b247a

File tree

1 file changed

+39
-37
lines changed

1 file changed

+39
-37
lines changed

cli.ts

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,36 @@ async function runEpisode(
309309
model: string,
310310
tasks: Task[],
311311
prefix: string,
312+
) {
313+
return withRetries(
314+
() => runEpisodeAttempt(evalDef, agent, model, tasks, prefix),
315+
{
316+
retries: 3,
317+
onRetry(error, attempt, retries) {
318+
const baseMessage =
319+
error instanceof Error ? error.message : String(error);
320+
console.error(
321+
`${prefix} Episode attempt ${attempt}/${retries} failed: ${baseMessage}`,
322+
);
323+
324+
if (attempt < retries) {
325+
console.log(
326+
`${prefix} Restarting episode from a clean state (attempt ${
327+
attempt + 1
328+
}/${retries})...`,
329+
);
330+
}
331+
},
332+
},
333+
);
334+
}
335+
336+
async function runEpisodeAttempt(
337+
evalDef: DatasetEval,
338+
agent: AgentRegistration,
339+
model: string,
340+
tasks: Task[],
341+
prefix: string,
312342
) {
313343
const baselineCommit = evalDef.from;
314344
let cwd: string | undefined;
@@ -344,52 +374,24 @@ async function runEpisode(
344374
let tasksExecuted = 0;
345375
let usage: Usage = { input: 0, output: 0, cost: 0 };
346376
const episodeActions: string[] = [];
347-
let episodeDuration = 0;
348377

349378
for (const task of tasks) {
350379
const logPrefix = `${prefix} ${task.commit}`;
351380

352381
try {
353-
let successfulRunDuration = 0;
354-
// TODO: retrying the agent run here means that if the agent completed only half of the work, the next attempt would pick up and continue those partial changes, which is not correct.
355-
// The agent should start again from a clean state and redo the work, so the whole loop should be restarted.
356-
const result = await withRetries(
357-
async () => {
358-
const startedAt = Date.now();
359-
const result = await agent.definition.run(
360-
model,
361-
task.prompt,
362-
cwd!,
363-
{
364-
onStart: (commandString: string) => {
365-
console.log(`${logPrefix} ${commandString.trim()}`);
366-
},
367-
logPrefix,
368-
},
369-
);
370-
successfulRunDuration = Date.now() - startedAt;
371-
return result;
372-
},
382+
const startedAt = Date.now();
383+
const result = await agent.definition.run(
384+
model,
385+
task.prompt,
386+
cwd!,
373387
{
374-
retries: 3,
375-
onRetry(error, attempt, retries) {
376-
const baseMessage =
377-
error instanceof Error ? error.message : String(error);
378-
console.error(
379-
`${logPrefix} Failed to render command for ${model} (attempt ${attempt}/${retries}): ${baseMessage}`,
380-
);
381-
382-
if (attempt < retries) {
383-
console.log(
384-
`${logPrefix} Retrying agent run (attempt ${
385-
attempt + 1
386-
}/${retries})...`,
387-
);
388-
}
388+
onStart: (commandString: string) => {
389+
console.log(`${logPrefix} ${commandString.trim()}`);
389390
},
391+
logPrefix,
390392
},
391393
);
392-
episodeDuration += successfulRunDuration;
394+
episodeDuration += Date.now() - startedAt;
393395

394396
// Only accumulate usage from the successful result
395397
usage.input += result.usage.input;

0 commit comments

Comments
 (0)