diff --git a/packages/worker-utils/src/cloud-agent-next-client.ts b/packages/worker-utils/src/cloud-agent-next-client.ts index 6249056e87..061d1b4f4d 100644 --- a/packages/worker-utils/src/cloud-agent-next-client.ts +++ b/packages/worker-utils/src/cloud-agent-next-client.ts @@ -33,6 +33,7 @@ export type CloudAgentPrepareSessionInput = { upstreamBranch?: string; callbackTarget?: CallbackTarget; createdOnPlatform?: string; + sandboxRetryOfCloudAgentSessionId?: string; gateThreshold?: 'off' | 'all' | 'warning' | 'critical'; runtimeSkills?: Array<{ name: string; @@ -44,6 +45,7 @@ export type CloudAgentPrepareSessionInput = { export type CloudAgentPrepareSessionOutput = { cloudAgentSessionId: string; kiloSessionId: string; + sandboxRetryPrepared?: true; }; export type CloudAgentInitiateInput = { @@ -294,7 +296,26 @@ export function createCloudAgentNextFetchClient(baseUrl: string): CloudAgentNext `Unexpected prepareSession response shape: ${JSON.stringify(data).slice(0, 500)}` ); } - return data as unknown as CloudAgentPrepareSessionOutput; + if (data.sandboxRetryPrepared !== undefined && data.sandboxRetryPrepared !== true) { + throw new Error( + `Unexpected prepareSession response shape: ${JSON.stringify(data).slice(0, 500)}` + ); + } + if ( + input.sandboxRetryOfCloudAgentSessionId !== undefined && + data.sandboxRetryPrepared !== true + ) { + throw new Error('prepareSession did not acknowledge sandbox retry preparation'); + } + + const output: CloudAgentPrepareSessionOutput = { + cloudAgentSessionId: data.cloudAgentSessionId, + kiloSessionId: data.kiloSessionId, + }; + if (data.sandboxRetryPrepared === true) { + output.sandboxRetryPrepared = true; + } + return output; }, async initiateFromPreparedSession(headers, input) { diff --git a/services/cloud-agent-next/src/persistence/CloudAgentSession.ts b/services/cloud-agent-next/src/persistence/CloudAgentSession.ts index f12abfa85a..827c8707b5 100644 --- a/services/cloud-agent-next/src/persistence/CloudAgentSession.ts +++ b/services/cloud-agent-next/src/persistence/CloudAgentSession.ts @@ -617,10 +617,13 @@ export class CloudAgentSession extends DurableObject { return { status: 'inspection-failed', error: 'Session metadata unavailable' }; return createAgentSandbox(this.env, metadata).stopWrappers(request); }, - recordSharedSandboxFailover: routeKey => - this.sharedSandboxFailoverRecorder - ? this.sharedSandboxFailoverRecorder(routeKey) - : recordSharedSandboxFailover(this.env.SHARED_SANDBOX_OVERRIDES, routeKey), + recordSharedSandboxFailover: async routeKey => { + if (this.sharedSandboxFailoverRecorder) { + await this.sharedSandboxFailoverRecorder(routeKey); + return; + } + await recordSharedSandboxFailover(this.env.SHARED_SANDBOX_OVERRIDES, routeKey); + }, requestAlarmAtOrBefore: deadline => this.scheduleAlarmAtOrBefore(deadline), getSessionIdForLogs: () => this.sessionId, }); diff --git a/services/cloud-agent-next/src/router/handlers/session-prepare.ts b/services/cloud-agent-next/src/router/handlers/session-prepare.ts index ea2c557c14..c8748729ec 100644 --- a/services/cloud-agent-next/src/router/handlers/session-prepare.ts +++ b/services/cloud-agent-next/src/router/handlers/session-prepare.ts @@ -360,18 +360,24 @@ const prepareSessionHandler = internalApiProtectedProcedure userId: ctx.userId, authToken: ctx.authToken, botId: ctx.botId, + sandboxRetryOfCloudAgentSessionId: input.sandboxRetryOfCloudAgentSessionId, }) : await registerNewSession(requestWithProfile, { env: ctx.env, userId: ctx.userId, authToken: ctx.authToken, botId: ctx.botId, + sandboxRetryOfCloudAgentSessionId: input.sandboxRetryOfCloudAgentSessionId, }); - return { + const output = { cloudAgentSessionId: result.cloudAgentSessionId, kiloSessionId: result.kiloSessionId, }; + if (result.sandboxRetryPrepared === true) { + return { ...output, sandboxRetryPrepared: true }; + } + return output; }); }); diff --git a/services/cloud-agent-next/src/router/schemas.ts b/services/cloud-agent-next/src/router/schemas.ts index 1659f3328e..d89bf09899 100644 --- a/services/cloud-agent-next/src/router/schemas.ts +++ b/services/cloud-agent-next/src/router/schemas.ts @@ -441,6 +441,9 @@ export const PrepareSessionInput = z callbackTarget: CallbackTargetSchema.optional().describe( 'Optional callback target configuration for execution completion notifications' ), + sandboxRetryOfCloudAgentSessionId: sessionIdSchema + .optional() + .describe('Failed Cloud Agent session that this fresh retry must not reuse'), // Organization context kilocodeOrganizationId: z @@ -546,6 +549,7 @@ export const PrepareSessionInput = z export const PrepareSessionOutput = z.object({ cloudAgentSessionId: z.string().describe('The generated cloud-agent session ID'), kiloSessionId: z.string().describe('The Kilo CLI session ID'), + sandboxRetryPrepared: z.literal(true).optional(), }); /** diff --git a/services/cloud-agent-next/src/session-prepare.test.ts b/services/cloud-agent-next/src/session-prepare.test.ts index 70d5b7dfe3..47656e967a 100644 --- a/services/cloud-agent-next/src/session-prepare.test.ts +++ b/services/cloud-agent-next/src/session-prepare.test.ts @@ -115,6 +115,7 @@ vi.mock('./session-service.js', () => ({ import { appRouter } from './router.js'; import { profileResolutionPolicyForSessionCreateOrigin } from './router/handlers/session-prepare.js'; +import { fetchSessionMetadata } from './session-service.js'; import type { TRPCContext, SessionId } from './types.js'; function createMockDOStub( @@ -265,6 +266,7 @@ describe('prepareSession endpoint', () => { recordInternalCompensationMock.mockResolvedValue(undefined); mergeProfileConfigurationMock.mockResolvedValue({}); assertKiloModelAvailableMock.mockResolvedValue(undefined); + vi.mocked(fetchSessionMetadata).mockResolvedValue(null); }); it('rejects request without internal API key header', async () => { @@ -704,6 +706,125 @@ describe('prepareSession endpoint', () => { ); }); + it('prepares a shared sandbox retry on the failover slot', async () => { + const sourceSessionId = 'agent_00000000-0000-4000-8000-000000000001'; + const routeKey = 'usr-000000000000000000000000000000000000000000000000'; + const failoverSandboxId = 'usr-b4593afcaf2e9e1dfb1611150b786cfe8aeba3c77352a3df'; + generateSandboxRoutingTargetMock.mockResolvedValueOnce({ + kind: 'shared', + routeKey, + }); + vi.mocked(fetchSessionMetadata).mockResolvedValueOnce({ + metadataSchemaVersion: 2, + identity: { sessionId: sourceSessionId, userId: 'test-user-123' }, + auth: {}, + workspace: { + sandboxId: routeKey, + sandboxRoute: { kind: 'shared', routeKey }, + }, + lifecycle: { version: 1, timestamp: Date.now() }, + }); + const overrideStore = { + get: vi.fn().mockResolvedValue(null), + put: vi.fn().mockResolvedValue(undefined), + }; + const doStub = createMockDOStub(); + const context = createInternalApiContext({ doStub }); + Object.assign(context.env, { SHARED_SANDBOX_OVERRIDES: overrideStore }); + const caller = appRouter.createCaller(context); + + const result = await caller.prepareSession({ + prompt: 'Retry on a fresh sandbox slot', + mode: 'code', + model: 'claude-3', + githubRepo: 'acme/repo', + sandboxRetryOfCloudAgentSessionId: sourceSessionId, + }); + + expect(result).toEqual({ + cloudAgentSessionId: 'agent_12345678-1234-1234-1234-123456789abc', + kiloSessionId: 'cli-session-abc123', + sandboxRetryPrepared: true, + }); + expect(fetchSessionMetadata).toHaveBeenCalledWith( + context.env, + 'test-user-123', + sourceSessionId + ); + expect(overrideStore.get).toHaveBeenCalledWith(`shared-sandbox-route:${routeKey}`); + expect(overrideStore.put).toHaveBeenCalledWith( + `shared-sandbox-route:${routeKey}`, + 'shared-slot-v1' + ); + expect(doStub.registerSession).toHaveBeenCalledWith( + expect.objectContaining({ + workspace: { + sandboxId: failoverSandboxId, + shallow: false, + sandboxRoute: { + kind: 'shared', + routeKey, + suffix: 'shared-slot-v1', + }, + }, + }) + ); + expect(recordSandboxIdentityMock).toHaveBeenCalledWith( + expect.objectContaining({ sandboxId: failoverSandboxId }), + expect.any(Object) + ); + }); + + it('rejects a shared sandbox retry when the source already uses the failover slot', async () => { + const sourceSessionId = 'agent_00000000-0000-4000-8000-000000000001'; + const routeKey = 'usr-000000000000000000000000000000000000000000000000'; + const failoverSandboxId = 'usr-b4593afcaf2e9e1dfb1611150b786cfe8aeba3c77352a3df'; + generateSandboxRoutingTargetMock.mockResolvedValueOnce({ + kind: 'shared', + routeKey, + }); + vi.mocked(fetchSessionMetadata).mockResolvedValueOnce({ + metadataSchemaVersion: 2, + identity: { sessionId: sourceSessionId, userId: 'test-user-123' }, + auth: {}, + workspace: { + sandboxId: failoverSandboxId, + sandboxRoute: { kind: 'shared', routeKey, suffix: 'shared-slot-v1' }, + }, + lifecycle: { version: 1, timestamp: Date.now() }, + }); + const overrideStore = { + get: vi.fn().mockResolvedValue('shared-slot-v1'), + put: vi.fn().mockResolvedValue(undefined), + }; + const doStub = createMockDOStub(); + const context = createInternalApiContext({ doStub }); + Object.assign(context.env, { SHARED_SANDBOX_OVERRIDES: overrideStore }); + const caller = appRouter.createCaller(context); + + await expect( + caller.prepareSession({ + prompt: 'Do not retry from a failed failover slot', + mode: 'code', + model: 'claude-3', + githubRepo: 'acme/repo', + sandboxRetryOfCloudAgentSessionId: sourceSessionId, + }) + ).rejects.toMatchObject({ code: 'BAD_REQUEST' }); + + expect(overrideStore.get).not.toHaveBeenCalled(); + expect(overrideStore.put).not.toHaveBeenCalled(); + expect(recordSandboxIdentityMock).not.toHaveBeenCalled(); + expect(createCliSessionMock).not.toHaveBeenCalled(); + expect(doStub.registerSession).not.toHaveBeenCalled(); + expect(recordSessionFailureMock).toHaveBeenCalledWith( + expect.objectContaining({ + failure: { stage: 'sandbox_identity', code: 'sandbox_id_derivation_failed' }, + }), + expect.any(Object) + ); + }); + it('creates auto-initiated devcontainer sessions with grouped DIND sandbox intent', async () => { generateSandboxRoutingTargetMock.mockResolvedValueOnce({ kind: 'isolated', diff --git a/services/cloud-agent-next/src/session/session-registration.ts b/services/cloud-agent-next/src/session/session-registration.ts index bb8956cbe7..a1756fe7db 100644 --- a/services/cloud-agent-next/src/session/session-registration.ts +++ b/services/cloud-agent-next/src/session/session-registration.ts @@ -21,14 +21,18 @@ import type { CloudAgentSession } from '../persistence/CloudAgentSession.js'; import type { SessionMetadata } from '../persistence/session-metadata.js'; import { logger } from '../logger.js'; import { withDORetry } from '../utils/do-retry.js'; -import { generateSessionId, SessionService } from '../session-service.js'; +import { fetchSessionMetadata, generateSessionId, SessionService } from '../session-service.js'; import { createCloudAgentSessionReport, recordCloudAgentSandboxIdentity, recordCloudAgentSessionFailure, } from '../telemetry/session-reports.js'; -import { generateSandboxRoutingTarget } from '../sandbox-id.js'; -import { resolveSharedSandboxAssignment } from '../shared-sandbox-route.js'; +import { generateSandboxRoutingTarget, isGeneratedSharedSandboxId } from '../sandbox-id.js'; +import { + recordSharedSandboxFailover, + resolveSharedSandboxAssignment, + SHARED_SANDBOX_FAILOVER_SUFFIX, +} from '../shared-sandbox-route.js'; import { generateKiloSessionId } from '../utils/kilo-session-id.js'; import { createMessageId } from './message-id.js'; import type { @@ -50,6 +54,7 @@ export type SessionRegistrationContext = { userId: string; authToken: string; botId?: string; + sandboxRetryOfCloudAgentSessionId?: string; }; export type SessionRegistrationResult = { @@ -57,6 +62,7 @@ export type SessionRegistrationResult = { kiloSessionId: string; sandboxId: SandboxId; sandboxRoute?: SharedSandboxRouteMetadata; + sandboxRetryPrepared?: true; /** * Canonical initial turn reserved for a later legacy initiation request. */ @@ -116,6 +122,23 @@ type NewSessionAllocation = SessionRegistrationResult & { rollbackCliSession: () => Promise; }; +type RetrySandboxAllocation = { + sandboxId: SandboxId; + sandboxRoute?: SharedSandboxRouteMetadata; + sandboxRetryPrepared?: true; +}; + +type SandboxRetryPreparationRejection = + | 'source_metadata_unavailable' + | 'source_metadata_missing' + | 'source_sandbox_missing' + | 'source_shared_route_missing' + | 'source_shared_route_inconsistent' + | 'source_shared_already_failed_over' + | 'target_shared_route_mismatch' + | 'target_shared_failover_unavailable' + | 'target_sandbox_unchanged'; + function initialAdmissionFailure( result: Extract ): Extract { @@ -136,6 +159,141 @@ async function recordPostSetupFailure(record: () => Promise): Promise> { + let sourceMetadata: SessionMetadata | null; + try { + sourceMetadata = await fetchSessionMetadata(ctx.env, ctx.userId, sourceSessionId); + } catch (error) { + logger + .withFields({ + errorType: error instanceof Error ? error.name : typeof error, + logTag: 'sandbox_retry_source_metadata_unavailable', + }) + .warn('Failed to load source session metadata for sandbox retry'); + rejectSandboxRetryPreparation('source_metadata_unavailable'); + } + + if (!sourceMetadata) rejectSandboxRetryPreparation('source_metadata_missing'); + return getSourceSandboxForRetry(sourceMetadata); +} + +async function allocateSandboxForNewSession( + input: SessionRegistrationInput, + ctx: SessionRegistrationContext, + cloudAgentSessionId: string +): Promise { + const target = await generateSandboxRoutingTarget( + ctx.env.PER_SESSION_SANDBOX_ORG_IDS, + input.options?.kilocodeOrganizationId, + ctx.userId, + cloudAgentSessionId, + ctx.botId, + input.runtime?.devcontainer + ); + + const sourceSessionId = ctx.sandboxRetryOfCloudAgentSessionId; + if (!sourceSessionId) { + if (target.kind === 'shared') { + const assignment = await resolveSharedSandboxAssignment( + ctx.env.SHARED_SANDBOX_OVERRIDES, + target.routeKey + ); + return { + sandboxId: assignment.sandboxId, + sandboxRoute: { + kind: 'shared', + routeKey: target.routeKey, + ...(assignment.suffix ? { suffix: assignment.suffix } : {}), + }, + }; + } + return { sandboxId: target.sandboxId }; + } + + const source = await fetchSourceSandboxForRetry(ctx, sourceSessionId); + if (target.kind === 'shared') { + if (source.sandboxRoute?.routeKey !== target.routeKey) { + rejectSandboxRetryPreparation('target_shared_route_mismatch'); + } + + let assignment: Awaited>; + try { + assignment = await recordSharedSandboxFailover( + ctx.env.SHARED_SANDBOX_OVERRIDES, + target.routeKey + ); + } catch (error) { + logger + .withFields({ + errorType: error instanceof Error ? error.name : typeof error, + logTag: 'sandbox_retry_failover_unavailable', + }) + .warn('Failed to record shared sandbox failover for sandbox retry'); + rejectSandboxRetryPreparation('target_shared_failover_unavailable'); + } + if (assignment.suffix !== SHARED_SANDBOX_FAILOVER_SUFFIX) { + rejectSandboxRetryPreparation('target_shared_failover_unavailable'); + } + if (assignment.sandboxId === source.sandboxId) { + rejectSandboxRetryPreparation('target_sandbox_unchanged'); + } + + return { + sandboxId: assignment.sandboxId, + sandboxRoute: { + kind: 'shared', + routeKey: target.routeKey, + suffix: SHARED_SANDBOX_FAILOVER_SUFFIX, + }, + sandboxRetryPrepared: true, + }; + } + + if (target.sandboxId === source.sandboxId) { + rejectSandboxRetryPreparation('target_sandbox_unchanged'); + } + return { sandboxId: target.sandboxId, sandboxRetryPrepared: true }; +} + async function allocateNewSession( input: SessionRegistrationInput, ctx: SessionRegistrationContext @@ -153,29 +311,12 @@ async function allocateNewSession( let sandboxId: SandboxId; let sandboxRoute: SharedSandboxRouteMetadata | undefined; + let sandboxRetryPrepared: true | undefined; try { - const target = await generateSandboxRoutingTarget( - ctx.env.PER_SESSION_SANDBOX_ORG_IDS, - input.options?.kilocodeOrganizationId, - ctx.userId, - cloudAgentSessionId, - ctx.botId, - input.runtime?.devcontainer - ); - if (target.kind === 'shared') { - const assignment = await resolveSharedSandboxAssignment( - ctx.env.SHARED_SANDBOX_OVERRIDES, - target.routeKey - ); - sandboxId = assignment.sandboxId; - sandboxRoute = { - kind: 'shared', - routeKey: target.routeKey, - ...(assignment.suffix ? { suffix: assignment.suffix } : {}), - }; - } else { - sandboxId = target.sandboxId; - } + const allocation = await allocateSandboxForNewSession(input, ctx, cloudAgentSessionId); + sandboxId = allocation.sandboxId; + sandboxRoute = allocation.sandboxRoute; + sandboxRetryPrepared = allocation.sandboxRetryPrepared; } catch (error) { await recordCloudAgentSessionFailure( { @@ -225,6 +366,7 @@ async function allocateNewSession( kiloSessionId, sandboxId, sandboxRoute, + ...(sandboxRetryPrepared === true ? { sandboxRetryPrepared } : {}), initialTurn, sessionService, rollbackCliSession: async () => { @@ -402,6 +544,10 @@ export async function startNewSession( cloudAgentSessionId: allocation.cloudAgentSessionId, kiloSessionId: allocation.kiloSessionId, sandboxId: allocation.sandboxId, + sandboxRoute: allocation.sandboxRoute, + ...(allocation.sandboxRetryPrepared === true + ? { sandboxRetryPrepared: allocation.sandboxRetryPrepared } + : {}), admission, }; } diff --git a/services/cloud-agent-next/src/shared-sandbox-route.ts b/services/cloud-agent-next/src/shared-sandbox-route.ts index 3e75c3d3a5..1ecf528141 100644 --- a/services/cloud-agent-next/src/shared-sandbox-route.ts +++ b/services/cloud-agent-next/src/shared-sandbox-route.ts @@ -74,16 +74,21 @@ export async function resolveSharedSandboxAssignment( export async function recordSharedSandboxFailover( store: SharedSandboxOverrideStore, routeKey: SandboxId -): Promise { +): Promise { if (!isGeneratedSharedSandboxId(routeKey)) { throw new Error('Shared sandbox route key must be a generated shared sandbox ID'); } const key = routeOverrideKey(routeKey); const existing = await readRouteOverride(store, key); - if (existing === SHARED_SANDBOX_FAILOVER_SUFFIX) return; - if (existing !== null) { + if (existing !== null && existing !== SHARED_SANDBOX_FAILOVER_SUFFIX) { throw new Error('Invalid shared sandbox override'); } - await writeRouteOverride(store, key, SHARED_SANDBOX_FAILOVER_SUFFIX); + if (existing === null) { + await writeRouteOverride(store, key, SHARED_SANDBOX_FAILOVER_SUFFIX); + } + return { + sandboxId: await deriveSharedSandboxId(routeKey, SHARED_SANDBOX_FAILOVER_SUFFIX), + suffix: SHARED_SANDBOX_FAILOVER_SUFFIX, + }; } diff --git a/services/code-review-infra/src/code-review-orchestrator.ts b/services/code-review-infra/src/code-review-orchestrator.ts index f3aa75d26b..52d0721750 100644 --- a/services/code-review-infra/src/code-review-orchestrator.ts +++ b/services/code-review-infra/src/code-review-orchestrator.ts @@ -492,7 +492,8 @@ export class CodeReviewOrchestrator extends DurableObject { private async tryRetryFreshSessionAfterSandboxError( source: string, error: unknown, - classification: CloudAgentNextFreshRetryClassification + classification: CloudAgentNextFreshRetryClassification, + failedCloudAgentSessionId?: string ): Promise { if (this.state.sandboxRetryAttempted === true) { this.logCloudAgentNextFreshSessionRetrySkipped( @@ -531,6 +532,7 @@ export class CodeReviewOrchestrator extends DurableObject { const previousSandboxId = this.state.sandboxId; this.state.sandboxRetryAttempted = true; + this.state.sandboxRetryOfCloudAgentSessionId = failedCloudAgentSessionId; this.state.previousCloudAgentSessionId = undefined; this.state.sessionId = undefined; this.state.cliSessionId = undefined; @@ -551,6 +553,7 @@ export class CodeReviewOrchestrator extends DurableObject { previousSessionId, previousCliSessionId, previousSandboxId, + failedCloudAgentSessionId, sandboxRetryAttempted: true, retryOutcome: 'scheduled', retryDelayMs, @@ -959,6 +962,7 @@ export class CodeReviewOrchestrator extends DurableObject { skipBalanceCheck?: boolean; agentVersion?: string; previousCloudAgentSessionId?: string; + sandboxRetryOfCloudAgentSessionId?: string; repositorySize?: string | null; runReviewDelayMs?: number; }): Promise<{ status: CodeReviewStatus }> { @@ -986,6 +990,8 @@ export class CodeReviewOrchestrator extends DurableObject { skipBalanceCheck: params.skipBalanceCheck, agentVersion: params.agentVersion, previousCloudAgentSessionId: params.previousCloudAgentSessionId, + sandboxRetryOfCloudAgentSessionId: params.sandboxRetryOfCloudAgentSessionId, + sandboxRetryAttempted: params.sandboxRetryOfCloudAgentSessionId ? true : undefined, repositorySize: params.repositorySize, }; await this.saveState(); @@ -1064,7 +1070,7 @@ export class CodeReviewOrchestrator extends DurableObject { return false; } - if (params.sessionId && this.state.sessionId && params.sessionId !== this.state.sessionId) { + if (!params.sessionId || !this.state.sessionId || params.sessionId !== this.state.sessionId) { console.warn( '[CodeReviewOrchestrator] retryFreshAfterInfraFailure ignored session mismatch', { @@ -1097,6 +1103,7 @@ export class CodeReviewOrchestrator extends DurableObject { skipBalanceCheck: this.state.skipBalanceCheck, agentVersion: this.state.agentVersion, previousCloudAgentSessionId: undefined, + sandboxRetryOfCloudAgentSessionId: params.sessionId, repositorySize: this.state.repositorySize, runReviewDelayMs: retryDelayMs, }); @@ -1257,6 +1264,7 @@ export class CodeReviewOrchestrator extends DurableObject { private async runWithCloudAgentNext(): Promise { const runStartTime = Date.now(); const client = this.getCloudAgentNextClient(); + let preparedCloudAgentSessionId: string | undefined; try { const statusUpdateResult = await this.updateStatus('running'); @@ -1297,6 +1305,7 @@ export class CodeReviewOrchestrator extends DurableObject { runtimeSkills: githubCloudReviewSkillAttached ? [GITHUB_CLOUD_REVIEW_SKILL] : undefined, createdOnPlatform: 'code-review' as const, callbackTarget, + sandboxRetryOfCloudAgentSessionId: this.state.sandboxRetryOfCloudAgentSessionId, }; console.log('[CodeReviewOrchestrator] Calling prepareSession', { @@ -1315,6 +1324,7 @@ export class CodeReviewOrchestrator extends DurableObject { internalHeaders, prepareInput ); + preparedCloudAgentSessionId = cloudAgentSessionId; const repositorySize = this.state.repositorySize ?? null; @@ -1373,7 +1383,8 @@ export class CodeReviewOrchestrator extends DurableObject { await this.tryRetryFreshSessionAfterSandboxError( 'cloud-agent-next-fresh', error, - retryClassification + retryClassification, + preparedCloudAgentSessionId ) ) { return; @@ -1575,7 +1586,8 @@ export class CodeReviewOrchestrator extends DurableObject { await this.tryRetryFreshSessionAfterSandboxError( 'cloud-agent-next-followup', error, - retryClassification + retryClassification, + previousSessionId ) ) { return; diff --git a/services/code-review-infra/src/types.ts b/services/code-review-infra/src/types.ts index fe0781a06a..a9e0fc4850 100644 --- a/services/code-review-infra/src/types.ts +++ b/services/code-review-infra/src/types.ts @@ -71,6 +71,8 @@ export interface CodeReview { agentVersion?: string; /** Cloud-agent session ID from a previous completed review, for session continuation */ previousCloudAgentSessionId?: string; + /** Failed Cloud Agent session ID that this fresh retry must avoid reusing at the sandbox layer. */ + sandboxRetryOfCloudAgentSessionId?: string; sandboxRetryAttempted?: boolean; /** Provider-reported repository storage size, formatted for log correlation. */ repositorySize?: string | null; @@ -141,6 +143,8 @@ export interface CodeReviewRequest { agentVersion?: string; /** Cloud-agent session ID from a previous completed review, for session continuation */ previousCloudAgentSessionId?: string; + /** Failed Cloud Agent session ID that this fresh retry must avoid reusing at the sandbox layer. */ + sandboxRetryOfCloudAgentSessionId?: string; /** Provider-reported repository storage size, formatted for log correlation. */ repositorySize?: string | null; } diff --git a/services/code-review-infra/test/integration/code-review-orchestrator.test.ts b/services/code-review-infra/test/integration/code-review-orchestrator.test.ts index d1fdd30412..6f8487f4d9 100644 --- a/services/code-review-infra/test/integration/code-review-orchestrator.test.ts +++ b/services/code-review-infra/test/integration/code-review-orchestrator.test.ts @@ -523,6 +523,7 @@ describe('CodeReviewOrchestrator recovery', () => { return trpcSuccess({ cloudAgentSessionId: 'agent-retry-fresh', kiloSessionId: 'ses_retry_fresh', + sandboxRetryPrepared: true, }); } if (url.includes('/trpc/initiateFromKilocodeSessionV2')) { @@ -577,6 +578,7 @@ describe('CodeReviewOrchestrator recovery', () => { return trpcSuccess({ cloudAgentSessionId: 'agent-retry-fresh', kiloSessionId: 'ses_retry_fresh', + sandboxRetryPrepared: true, }); } if (url.includes('/trpc/initiateFromKilocodeSessionV2')) { @@ -629,6 +631,8 @@ describe('CodeReviewOrchestrator recovery', () => { }); await expect(storedReview(retryStub)).resolves.toMatchObject({ repositorySize: '100 MB', + sandboxRetryOfCloudAgentSessionId: 'agent-old', + sandboxRetryAttempted: true, }); const retryAlarm = await storedAlarm(retryStub); expectAutoRetryAlarmInRange(retryAlarm, retrySchedulingStartedAt); @@ -644,6 +648,13 @@ describe('CodeReviewOrchestrator recovery', () => { sessionId: 'agent-retry-fresh', cliSessionId: 'ses_retry_fresh', }); + const prepareCall = getFetchCall(fetchMock, '/trpc/prepareSession'); + const prepareBody = JSON.parse(String(prepareCall?.[1]?.body)); + expect(prepareBody.sandboxRetryOfCloudAgentSessionId).toBe('agent-old'); + expect(prepareBody).not.toHaveProperty('previousCloudAgentSessionId'); + expect(fetchCalls(fetchMock, '/trpc/getSessionHealth')).toHaveLength(0); + expect(fetchCalls(fetchMock, '/trpc/updateSession')).toHaveLength(0); + expect(fetchCalls(fetchMock, '/trpc/sendMessageV2')).toHaveLength(0); }); it('retry-fresh ignores mismatched sessions', async () => { @@ -808,6 +819,78 @@ describe('CodeReviewOrchestrator recovery', () => { expect(failedStatusUpdates).toHaveLength(0); }); + it('retries initiateFromPreparedSession with the failed session pinned', async () => { + const stub = getReviewStub(); + const failedSessionId = 'agent-failed-initiate'; + const retrySessionId = 'agent-retry-after-initiate'; + let prepareCalls = 0; + let initiateCalls = 0; + const fetchMock = vi.fn(async (request: RequestInfo | URL) => { + const url = String(request); + if (url.includes('/api/internal/code-review-status/')) { + return Response.json({ success: true }); + } + if (url.includes('/trpc/prepareSession')) { + prepareCalls += 1; + if (prepareCalls === 1) { + return trpcSuccess({ + cloudAgentSessionId: failedSessionId, + kiloSessionId: 'ses_failed_initiate', + }); + } + return trpcSuccess({ + cloudAgentSessionId: retrySessionId, + kiloSessionId: 'ses_retry_after_initiate', + sandboxRetryPrepared: true, + }); + } + if (url.includes('/trpc/initiateFromKilocodeSessionV2')) { + initiateCalls += 1; + if (initiateCalls === 1) { + return trpcError(500, 'SandboxError: HTTP error! status: 500 during launch'); + } + return trpcSuccess({ executionId: 'exec-retry-after-initiate', status: 'running' }); + } + return new Response('unexpected fetch', { status: 500 }); + }); + globalThis.fetch = fetchMock; + + await runInDurableObject(stub, async (_instance: CodeReviewOrchestrator, state) => { + await state.storage.put('state', codeReview()); + await state.storage.setAlarm(Date.now() + 30_000); + }); + + const retrySchedulingStartedAt = Date.now(); + const ran = await runDurableObjectAlarm(stub); + + expect(ran).toBe(true); + await expect(stub.status()).resolves.toMatchObject({ status: 'queued' }); + expect(fetchCalls(fetchMock, '/trpc/prepareSession')).toHaveLength(1); + expect(fetchCalls(fetchMock, '/trpc/initiateFromKilocodeSessionV2')).toHaveLength(1); + await expectAutoRetryScheduled(stub, retrySchedulingStartedAt); + await expect(storedReview(stub)).resolves.toMatchObject({ + status: 'queued', + sandboxRetryAttempted: true, + sandboxRetryOfCloudAgentSessionId: failedSessionId, + }); + + const retryRan = await runDurableObjectAlarm(stub); + expect(retryRan).toBe(true); + await expect(stub.status()).resolves.toMatchObject({ + status: 'running', + sessionId: retrySessionId, + cliSessionId: 'ses_retry_after_initiate', + }); + expect(fetchCalls(fetchMock, '/trpc/prepareSession')).toHaveLength(2); + expect(fetchCalls(fetchMock, '/trpc/initiateFromKilocodeSessionV2')).toHaveLength(2); + + const prepareFetchCalls = fetchCalls(fetchMock, '/trpc/prepareSession'); + const firstPrepareBody = JSON.parse(String(prepareFetchCalls[0]?.[1]?.body)); + const retryPrepareBody = JSON.parse(String(prepareFetchCalls[1]?.[1]?.body)); + expect(firstPrepareBody).not.toHaveProperty('sandboxRetryOfCloudAgentSessionId'); + expect(retryPrepareBody.sandboxRetryOfCloudAgentSessionId).toBe(failedSessionId); + }); + it('retries prepareSession from a structured workspace mkdir retry marker', async () => { const stub = getReviewStub(); let prepareCalls = 0; @@ -1963,6 +2046,7 @@ describe('CodeReviewOrchestrator recovery', () => { return trpcSuccess({ cloudAgentSessionId: 'agent-fresh-after-sandbox-500', kiloSessionId: 'ses_fresh_after_sandbox_500', + sandboxRetryPrepared: true, }); } if (url.includes('/trpc/initiateFromKilocodeSessionV2')) { @@ -1995,6 +2079,11 @@ describe('CodeReviewOrchestrator recovery', () => { expect(fetchCalls(fetchMock, '/trpc/prepareSession')).toHaveLength(0); expect(fetchCalls(fetchMock, '/trpc/initiateFromKilocodeSessionV2')).toHaveLength(0); await expectAutoRetryScheduled(stub, retrySchedulingStartedAt); + await expect(storedReview(stub)).resolves.toMatchObject({ + status: 'queued', + sandboxRetryAttempted: true, + sandboxRetryOfCloudAgentSessionId: previousSessionId, + }); const retryRan = await runDurableObjectAlarm(stub); expect(retryRan).toBe(true); @@ -2009,6 +2098,10 @@ describe('CodeReviewOrchestrator recovery', () => { expect(fetchCalls(fetchMock, '/trpc/prepareSession')).toHaveLength(1); expect(fetchCalls(fetchMock, '/trpc/initiateFromKilocodeSessionV2')).toHaveLength(1); + const prepareCall = getFetchCall(fetchMock, '/trpc/prepareSession'); + const prepareBody = JSON.parse(String(prepareCall?.[1]?.body)); + expect(prepareBody.sandboxRetryOfCloudAgentSessionId).toBe(previousSessionId); + const stored = await storedReview(stub); expect(stored).toMatchObject({ sandboxRetryAttempted: true, @@ -2065,6 +2158,11 @@ describe('CodeReviewOrchestrator recovery', () => { expect(fetchCalls(fetchMock, '/trpc/prepareSession')).toHaveLength(0); expect(fetchCalls(fetchMock, '/trpc/initiateFromKilocodeSessionV2')).toHaveLength(0); await expectAutoRetryScheduled(stub, retrySchedulingStartedAt); + await expect(storedReview(stub)).resolves.toMatchObject({ + status: 'queued', + sandboxRetryAttempted: true, + sandboxRetryOfCloudAgentSessionId: previousSessionId, + }); const retryRan = await runDurableObjectAlarm(stub); expect(retryRan).toBe(true); @@ -2075,6 +2173,9 @@ describe('CodeReviewOrchestrator recovery', () => { expect(fetchCalls(fetchMock, '/trpc/sendMessageV2')).toHaveLength(1); expect(fetchCalls(fetchMock, '/trpc/prepareSession')).toHaveLength(1); expect(fetchCalls(fetchMock, '/trpc/initiateFromKilocodeSessionV2')).toHaveLength(0); + const prepareCall = getFetchCall(fetchMock, '/trpc/prepareSession'); + const prepareBody = JSON.parse(String(prepareCall?.[1]?.body)); + expect(prepareBody.sandboxRetryOfCloudAgentSessionId).toBe(previousSessionId); }); it('aborts alarm recovery before cloud-agent calls when DB is already terminal', async () => {