Skip to content

Commit 281508b

Browse files
committed
chore: remove logging the output of checks commands
1 parent bc32236 commit 281508b

File tree

3 files changed

+51
-57
lines changed

3 files changed

+51
-57
lines changed

cli.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ async function runEpisodeAttempt(
361361
evaluation: evalDef,
362362
cwd,
363363
config: assignment.args,
364+
logPrefix: prefix,
364365
});
365366
preparedScores.set(assignment.name, prepared);
366367
} catch (error) {
@@ -422,6 +423,7 @@ async function runEpisodeAttempt(
422423
model,
423424
cwd,
424425
preparedScores,
426+
prefix,
425427
);
426428

427429
if (episodeAggregation.size === 0) {
@@ -573,6 +575,7 @@ async function collectAggregationInputsForRun(
573575
model: ModelCombination,
574576
cwd: string,
575577
preparedReferences: Map<string, unknown>,
578+
logPrefix?: string,
576579
): Promise<Map<string, ScoreAggregationInput>> {
577580
const aggregationInputs = new Map<string, ScoreAggregationInput>();
578581

@@ -603,6 +606,7 @@ async function collectAggregationInputsForRun(
603606
evaluation: datasetEval,
604607
cwd,
605608
config: assignment.args,
609+
logPrefix,
606610
});
607611

608612
ensureAggregationEntry(aggregationInputs, assignment).judgeResults.push(

lib/createScore.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ export interface ScorePreparationContext<Config = unknown> {
1616
evaluation: DatasetEval;
1717
cwd: string;
1818
config: Config;
19+
logPrefix?: string;
1920
}
2021

2122
export interface ScoreEvaluationContext<Reference, Config = unknown> {
@@ -24,6 +25,7 @@ export interface ScoreEvaluationContext<Reference, Config = unknown> {
2425
config: Config;
2526
judge: Judge;
2627
reference: Reference;
28+
logPrefix?: string;
2729
}
2830

2931
export interface ScoreResult {

scores/checks.ts

Lines changed: 45 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -125,19 +125,19 @@ const COMMAND_TIMEOUT_MS = 5 * 60 * 1000;
125125
type ChecksConfig = z.infer<typeof commandConfigSchema>;
126126

127127
export default createScore<PreparedCheck[], ChecksConfig>({
128-
prepare: async ({ cwd, evaluation, config }) => {
128+
prepare: async ({ cwd, evaluation, config, logPrefix }) => {
129129
const parsedConfig = commandConfigSchema.parse(config ?? {});
130130

131131
for (const command of parsedConfig.setup) {
132132
const result = await runCommand(command, cwd);
133-
logSetupExecution(command, result);
133+
logSetupExecution(command, result, logPrefix);
134134
}
135135

136136
const results: PreparedCheck[] = [];
137137

138138
for (const command of parsedConfig.commands) {
139139
const baseline = await runCommand(command, cwd);
140-
logExecution("baseline", command, baseline);
140+
logExecution("baseline", command, baseline, logPrefix);
141141
results.push({ command, baseline });
142142
}
143143

@@ -148,13 +148,20 @@ export default createScore<PreparedCheck[], ChecksConfig>({
148148

149149
return results;
150150
},
151-
evaluate: async ({ evaluation, cwd, judge, reference, config: _config }) => {
151+
evaluate: async ({
152+
evaluation,
153+
cwd,
154+
judge,
155+
reference,
156+
config: _config,
157+
logPrefix,
158+
}) => {
152159
finalizeAgentChanges(evaluation, cwd, evaluation.from);
153160

154161
for (const entry of reference) {
155162
if (!entry.after) {
156163
entry.after = await runCommand(entry.command, cwd);
157-
logExecution("after", entry.command, entry.after);
164+
logExecution("after", entry.command, entry.after, logPrefix);
158165
}
159166
}
160167

@@ -273,68 +280,49 @@ function formatExecution(execution: CommandExecution): string {
273280
return `${status} (${exitInfo}, ${duration})${error}\nstdout: ${stdout.length > 0 ? stdout : "<empty>"}\nstderr: ${stderr.length > 0 ? stderr : "<empty>"}`;
274281
}
275282

283+
/**
 * Builds the compact, single-string summary of a command run used for logging.
 *
 * The summary contains the PASS/FAIL status, the exit code (or "no exit code"
 * when `exitCode` is null), the runtime in milliseconds and, when set, the
 * error message. Captured stdout/stderr are not part of the summary.
 *
 * @param execution - Result of a finished command run.
 * @returns Text shaped like `PASS (exit 0, 12ms)` or
 *   `FAIL (no exit code, 3000ms) error: <message>`.
 */
function formatExecutionForLog(execution: CommandExecution): string {
  const { success, exitCode, runtimeMs, errorMessage } = execution;

  let summary = success ? "PASS" : "FAIL";
  summary += exitCode !== null ? ` (exit ${exitCode}, ` : " (no exit code, ";
  summary += `${runtimeMs}ms)`;

  // The error suffix is only appended for a non-empty message.
  if (errorMessage) {
    summary += ` error: ${errorMessage}`;
  }

  return summary;
}
294+
276295
function logExecution(
277296
stage: "baseline" | "after",
278297
command: string,
279298
execution: CommandExecution,
299+
logPrefix?: string,
280300
): void {
281301
const header =
282302
stage === "baseline" ? "[checks] Baseline" : "[checks] After agent";
283-
const formatted = formatExecution(execution);
284-
console.log(`${header} ${command}\n${formatted}\n`);
285-
286-
if (!execution.success) {
287-
const stdoutLabel =
288-
stage === "baseline"
289-
? "[checks] Baseline stdout"
290-
: "[checks] After agent stdout";
291-
const stderrLabel =
292-
stage === "baseline"
293-
? "[checks] Baseline stderr"
294-
: "[checks] After agent stderr";
295-
296-
const rawStdout = execution.stdout?.trim() ?? "";
297-
const rawStderr = execution.stderr?.trim() ?? "";
298-
299-
console.log(
300-
`${stdoutLabel} ${command}\n${rawStdout.length > 0 ? rawStdout : "<empty>"}\n`,
301-
);
302-
console.log(
303-
`${stderrLabel} ${command}\n${rawStderr.length > 0 ? rawStderr : "<empty>"}\n`,
304-
);
305-
306-
if (execution.errorMessage) {
307-
const errorLabel =
308-
stage === "baseline"
309-
? "[checks] Baseline error"
310-
: "[checks] After agent error";
311-
console.log(`${errorLabel} ${command}\n${execution.errorMessage}\n`);
312-
}
313-
}
303+
const formatted = formatExecutionForLog(execution);
304+
logLines(logPrefix, `${header} ${command}`);
305+
logLines(logPrefix, formatted);
314306
}
315307

316-
function logSetupExecution(command: string, execution: CommandExecution): void {
317-
const formatted = formatExecution(execution);
318-
console.log(`[checks] Setup ${command}\n${formatted}\n`);
319-
320-
if (!execution.success) {
321-
const stdoutLabel = `[checks] Setup stdout`;
322-
const stderrLabel = `[checks] Setup stderr`;
323-
324-
const rawStdout = execution.stdout?.trim() ?? "";
325-
const rawStderr = execution.stderr?.trim() ?? "";
326-
327-
console.log(
328-
`${stdoutLabel} ${command}\n${rawStdout.length > 0 ? rawStdout : "<empty>"}\n`,
329-
);
330-
console.log(
331-
`${stderrLabel} ${command}\n${rawStderr.length > 0 ? rawStderr : "<empty>"}\n`,
332-
);
308+
function logSetupExecution(
309+
command: string,
310+
execution: CommandExecution,
311+
logPrefix?: string,
312+
): void {
313+
const formatted = formatExecutionForLog(execution);
314+
logLines(logPrefix, `[checks] Setup ${command}`);
315+
logLines(logPrefix, formatted);
316+
}
333317

334-
if (execution.errorMessage) {
335-
console.log(
336-
`[checks] Setup error ${command}\n${execution.errorMessage}\n`,
337-
);
318+
/**
 * Writes `message` to the console one line at a time, optionally prefixing
 * every line.
 *
 * The message is split on both LF and CRLF line endings so that text produced
 * on platforms using `\r\n` does not leave a stray carriage return at the end
 * of each logged line. Empty lines are skipped.
 *
 * @param logPrefix - Text placed before each line, followed by a single space.
 *   When undefined or empty, lines are logged unchanged.
 * @param message - Possibly multi-line text to log.
 */
function logLines(logPrefix: string | undefined, message: string): void {
  // Resolve the prefix once; an empty prefix behaves like a missing one.
  const lead = logPrefix ? `${logPrefix} ` : "";

  for (const line of message.split(/\r?\n/)) {
    if (line.length === 0) continue;
    console.log(`${lead}${line}`);
  }
}

0 commit comments

Comments
 (0)