fix(auto): repair retries reuse session context instead of starting cold
When the autonomous solver fails to produce a checkpoint and enters the repair loop, subsequent retries previously called newSession() each time, wiping the conversation history. The agent restarted cold with no memory of what it had tried, what tools it had called, or why it failed — making meaningful repair nearly impossible. This change adds a keepSession option to runUnit(). When true, the newSession() call and session-switch guard logic are skipped; the repair prompt is sent as a follow-up in the existing conversation. The agent can now see its prior tool calls, file reads, and failure context when deciding how to fix the issue. Policy: - First attempt at each unit: keepSession=false (clean session, correct for independent slice boundaries — system prompt carries project state) - Repair retries within the same unit: keepSession=true (agent carries full context of what it already tried) - Next unit after success/failure: keepSession=false (clean boundary) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
parent
b464f2a78e
commit
1a681caa86
2 changed files with 84 additions and 68 deletions
|
|
@ -2336,6 +2336,7 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
|
|||
solverAssessment.repairAttempt,
|
||||
solverAssessment.maxRepairAttempts,
|
||||
),
|
||||
{ keepSession: true },
|
||||
);
|
||||
s.lastUnitAgentEndMessages = currentUnitResult.event?.messages ?? null;
|
||||
if (currentUnitResult.status === "cancelled") {
|
||||
|
|
|
|||
|
|
@ -37,9 +37,16 @@ let sessionSwitchGeneration = 0;
|
|||
* The promise is one-shot: resolveAgentEnd() is the only way to resolve it.
|
||||
* On session creation failure or timeout, returns { status: 'cancelled' }
|
||||
* without awaiting the promise.
|
||||
*
|
||||
* Options:
|
||||
* keepSession — when true, skip newSession() and continue in the current
|
||||
* conversation. Used for repair retries within the same unit so the agent
|
||||
* sees what it tried before and why it failed, rather than starting cold.
|
||||
* Default: false (each new unit starts with a clean session).
|
||||
*/
|
||||
export async function runUnit(ctx, pi, s, unitType, unitId, prompt) {
|
||||
debugLog("runUnit", { phase: "start", unitType, unitId });
|
||||
export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) {
|
||||
const keepSession = options?.keepSession === true;
|
||||
debugLog("runUnit", { phase: "start", unitType, unitId, keepSession });
|
||||
// GAP-10: Ensure cwd matches basePath BEFORE newSession() captures it. The
|
||||
// new session reads process.cwd() during construction to anchor its tool
|
||||
// runtime and system prompt; if cwd has drifted (async_bash, background
|
||||
|
|
@ -62,70 +69,75 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt) {
|
|||
},
|
||||
};
|
||||
}
|
||||
// ── Session creation with timeout ──
|
||||
debugLog("runUnit", { phase: "session-create", unitType, unitId });
|
||||
let sessionResult;
|
||||
let sessionTimeoutHandle;
|
||||
const mySessionSwitchGeneration = ++sessionSwitchGeneration;
|
||||
// GAP-07: Cancellation controller for newSession(). When the session-creation
|
||||
// timeout fires, we abort this controller so that any still-in-flight
|
||||
// newSession() work (which may clobber process.cwd()) is signalled to stop.
|
||||
// Note: SF's newSession() does not currently accept abortSignal in its
|
||||
// options type, so we cannot pass it directly — but aborting the controller
|
||||
// documents the intent clearly and is a no-op call site when the API adds it.
|
||||
const sessionAbortController = new AbortController();
|
||||
_setSessionSwitchInFlight(true);
|
||||
try {
|
||||
const sessionPromise = s.cmdCtx.newSession().finally(() => {
|
||||
if (sessionSwitchGeneration === mySessionSwitchGeneration) {
|
||||
_setSessionSwitchInFlight(false);
|
||||
}
|
||||
});
|
||||
const timeoutPromise = new Promise((resolve) => {
|
||||
sessionTimeoutHandle = setTimeout(() => {
|
||||
sessionAbortController.abort();
|
||||
resolve({ cancelled: true });
|
||||
}, NEW_SESSION_TIMEOUT_MS);
|
||||
});
|
||||
sessionResult = await Promise.race([sessionPromise, timeoutPromise]);
|
||||
} catch (sessionErr) {
|
||||
// ── Session creation (skipped for same-unit repair retries) ──
|
||||
// keepSession=true: send the repair prompt into the existing conversation so
|
||||
// the agent has full context of what it already tried and why it failed.
|
||||
// keepSession=false (default): start a clean session for each new unit.
|
||||
if (!keepSession) {
|
||||
debugLog("runUnit", { phase: "session-create", unitType, unitId });
|
||||
let sessionResult;
|
||||
let sessionTimeoutHandle;
|
||||
const mySessionSwitchGeneration = ++sessionSwitchGeneration;
|
||||
// GAP-07: Cancellation controller for newSession(). When the session-creation
|
||||
// timeout fires, we abort this controller so that any still-in-flight
|
||||
// newSession() work (which may clobber process.cwd()) is signalled to stop.
|
||||
// Note: SF's newSession() does not currently accept abortSignal in its
|
||||
// options type, so we cannot pass it directly — but aborting the controller
|
||||
// documents the intent clearly and is a no-op call site when the API adds it.
|
||||
const sessionAbortController = new AbortController();
|
||||
_setSessionSwitchInFlight(true);
|
||||
try {
|
||||
const sessionPromise = s.cmdCtx.newSession().finally(() => {
|
||||
if (sessionSwitchGeneration === mySessionSwitchGeneration) {
|
||||
_setSessionSwitchInFlight(false);
|
||||
}
|
||||
});
|
||||
const timeoutPromise = new Promise((resolve) => {
|
||||
sessionTimeoutHandle = setTimeout(() => {
|
||||
sessionAbortController.abort();
|
||||
resolve({ cancelled: true });
|
||||
}, NEW_SESSION_TIMEOUT_MS);
|
||||
});
|
||||
sessionResult = await Promise.race([sessionPromise, timeoutPromise]);
|
||||
} catch (sessionErr) {
|
||||
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
|
||||
const msg =
|
||||
sessionErr instanceof Error ? sessionErr.message : String(sessionErr);
|
||||
debugLog("runUnit", {
|
||||
phase: "session-error",
|
||||
unitType,
|
||||
unitId,
|
||||
error: msg,
|
||||
});
|
||||
return {
|
||||
status: "cancelled",
|
||||
errorContext: {
|
||||
message: `Session creation failed: ${msg}`,
|
||||
category: "session-failed",
|
||||
isTransient: true,
|
||||
},
|
||||
};
|
||||
}
|
||||
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
|
||||
const msg =
|
||||
sessionErr instanceof Error ? sessionErr.message : String(sessionErr);
|
||||
debugLog("runUnit", {
|
||||
phase: "session-error",
|
||||
unitType,
|
||||
unitId,
|
||||
error: msg,
|
||||
});
|
||||
return {
|
||||
status: "cancelled",
|
||||
errorContext: {
|
||||
message: `Session creation failed: ${msg}`,
|
||||
category: "session-failed",
|
||||
isTransient: true,
|
||||
},
|
||||
};
|
||||
}
|
||||
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
|
||||
if (sessionResult.cancelled) {
|
||||
debugLog("runUnit-session-timeout", { unitType, unitId });
|
||||
// On timeout, do NOT clear the in-flight guard here. The dangling
|
||||
// sessionPromise's .finally() has a generation check — it will clear the
|
||||
// guard when the underlying newSession promise eventually settles, but only
|
||||
// if no newer runUnit call has already incremented the generation. This is
|
||||
// the correct design: the guard stays true until the next session is ready,
|
||||
// preventing stale agent_end events from the timed-out session from being
|
||||
// processed by handleAgentEnd. The next runUnit call sets inFlight=true
|
||||
// again and its own .finally() manages the clearing.
|
||||
return {
|
||||
status: "cancelled",
|
||||
errorContext: {
|
||||
message: "Session creation timed out",
|
||||
category: "timeout",
|
||||
isTransient: true,
|
||||
},
|
||||
};
|
||||
if (sessionResult.cancelled) {
|
||||
debugLog("runUnit-session-timeout", { unitType, unitId });
|
||||
// On timeout, do NOT clear the in-flight guard here. The dangling
|
||||
// sessionPromise's .finally() has a generation check — it will clear the
|
||||
// guard when the underlying newSession promise eventually settles, but only
|
||||
// if no newer runUnit call has already incremented the generation. This is
|
||||
// the correct design: the guard stays true until the next session is ready,
|
||||
// preventing stale agent_end events from the timed-out session from being
|
||||
// processed by handleAgentEnd. The next runUnit call sets inFlight=true
|
||||
// again and its own .finally() manages the clearing.
|
||||
return {
|
||||
status: "cancelled",
|
||||
errorContext: {
|
||||
message: "Session creation timed out",
|
||||
category: "timeout",
|
||||
isTransient: true,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
if (!s.active) {
|
||||
return { status: "cancelled" };
|
||||
|
|
@ -150,9 +162,12 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt) {
|
|||
}
|
||||
}
|
||||
// ── Create the agent_end promise (per-unit one-shot) ──
|
||||
// This happens after newSession completes so session-switch agent_end events
|
||||
// from the previous session cannot resolve the new unit.
|
||||
_setSessionSwitchInFlight(false);
|
||||
// When keepSession=false: clear the in-flight guard now that the new session
|
||||
// is fully ready, so handleAgentEnd processes events for this unit only.
|
||||
// When keepSession=true: no session switch happened, guard is already clear.
|
||||
if (!keepSession) {
|
||||
_setSessionSwitchInFlight(false);
|
||||
}
|
||||
const unitPromise = new Promise((resolve) => {
|
||||
_setCurrentResolve(resolve);
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue