sf snapshot: uncommitted changes after 30m inactivity

This commit is contained in:
Mikael Hugo 2026-05-10 03:21:24 +02:00
parent 1a681caa86
commit 6b7d327672
14 changed files with 216 additions and 52 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -10,5 +10,27 @@
"successRate": 1,
"total": 4
}
},
"plan-slice": {
"zai/glm-4.5": {
"successes": 1,
"failures": 0,
"timeouts": 0,
"totalTokens": 0,
"totalCost": 0,
"lastUsed": "2026-05-10T00:25:29.268Z",
"successRate": 1,
"total": 1
},
"minimax/MiniMax-M2.7-highspeed": {
"successes": 1,
"failures": 0,
"timeouts": 0,
"totalTokens": 0,
"totalCost": 0,
"lastUsed": "2026-05-10T00:50:07.124Z",
"successRate": 1,
"total": 1
}
}
}

View file

@ -287,6 +287,17 @@ function formatToolList(serverName, tools) {
return lines.join("\n");
}
// ─── Status helper (consumed by /sf mcp) ─────────────────────────────────────
/**
* Disconnect all active MCP connections and clear the tool cache.
* Servers will lazily reconnect on the next mcp_discover or mcp_call.
*
* Purpose: allow /mcp reload to pick up config changes without a full restart.
* Consumer: /mcp reload command handler in commands-mcp-status.js.
*/
export async function disconnectAll() {
await closeAll();
}
/**
* Return the live connection status for a named MCP server.
* Safe to call even when the server has never been connected.

View file

@ -104,7 +104,7 @@ function getSessionStats(ctx) {
}
export function renderFooter(_theme, footerData, ctx, width) {
const git = refreshGitStatus(process.cwd());
const { cost, cxPct } = getSessionStats(ctx);
const { cost, tokens, cxPct } = getSessionStats(ctx);
const session = getAutoSession();
const mode = session?.getMode?.();
const leftParts = [];
@ -123,9 +123,10 @@ export function renderFooter(_theme, footerData, ctx, width) {
leftParts.push(chip("diff", `+${git.added}/-${git.deleted}`, "warning"));
}
if (git.ahead || git.behind) {
leftParts.push(
chip("sync", `${git.ahead} ahead ${git.behind} behind`, "warning"),
);
const syncParts = [];
if (git.ahead) syncParts.push(`${git.ahead}`);
if (git.behind) syncParts.push(`${git.behind}`);
leftParts.push(chip("sync", syncParts.join(" "), "warning"));
}
if (git.lastCommit) {
leftParts.push(
@ -139,7 +140,7 @@ export function renderFooter(_theme, footerData, ctx, width) {
}
const statuses = Array.from(footerData.getExtensionStatuses().entries())
.sort(([a], [b]) => a.localeCompare(b))
.map(([, text]) => text.trim())
.map(([, text]) => String(text ?? "").trim())
.filter(Boolean);
if (statuses.length) {
leftParts.push(chip("status", statuses.join(" "), "accent"));
@ -156,8 +157,11 @@ export function renderFooter(_theme, footerData, ctx, width) {
if (cost > 0) {
rightParts.push(chip("spent", `$${cost.toFixed(2)}`, "warning"));
}
const cxTone = cxPct >= 85 ? "error" : cxPct >= 60 ? "warning" : "success";
rightParts.push(chip("ctx", `${Math.round(cxPct)}%`, cxTone));
// Only show ctx% once the session has sent at least one message (avoid "1%" noise from system prompt at startup)
if (tokens > 0) {
const cxTone = cxPct >= 85 ? "error" : cxPct >= 60 ? "warning" : "success";
rightParts.push(chip("ctx", `${Math.round(cxPct)}%`, cxTone));
}
let rightLine = join(rightParts);
const maxRightWidth = Math.max(16, Math.floor(width * 0.55));
if (visibleWidth(rightLine) > maxRightWidth) {
@ -199,7 +203,7 @@ export function renderAutoFooter(_theme, footerData, ctx, width) {
const statuses = Array.from(footerData.getExtensionStatuses().entries())
.sort(([a], [b]) => a.localeCompare(b))
.map(([, text]) => text.trim())
.map(([, text]) => String(text ?? "").trim())
.filter(Boolean);
if (statuses.length) {
leftParts.push(ansiFg(SE.gray60, statuses.join(" ")));

View file

@ -38,7 +38,7 @@ function getLastCommit(cwd) {
}
function getDiffStats(cwd) {
try {
const raw = execFileSync("git", ["diff", "--stat"], {
const raw = execFileSync("git", ["diff", "HEAD", "--stat"], {
cwd,
encoding: "utf-8",
stdio: ["pipe", "pipe", "ignore"],
@ -48,10 +48,11 @@ function getDiffStats(cwd) {
let deleted = 0;
let modified = 0;
for (const line of raw.split("\n")) {
const m = line.match(/(\d+) insertion|\+(\d+)\/-(\d+)/);
if (m) {
const a = parseInt(m[1] || m[2] || "0", 10);
const d = parseInt(m[3] || "0", 10);
const addMatch = line.match(/(\d+) insertion/);
const delMatch = line.match(/(\d+) deletion/);
if (addMatch || delMatch) {
const a = addMatch ? parseInt(addMatch[1], 10) : 0;
const d = delMatch ? parseInt(delMatch[1], 10) : 0;
if (a) added += a;
if (d) deleted += d;
if (a || d) modified++;

View file

@ -36,6 +36,7 @@ import {
} from "../auto-tool-tracking.js";
import {
assessAutonomousSolverTurn,
appendAutonomousSolverCheckpoint,
beginAutonomousSolverIteration,
buildAutonomousSolverMissingCheckpointRepairPrompt,
buildAutonomousSolverPromptBlock,
@ -2362,12 +2363,13 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
});
}
if (solverAssessment.action === "pause") {
const missingCheckpointDiagnosis =
solverAssessment.reason === "solver-missing-checkpoint"
? classifyAutonomousSolverMissingCheckpointFailure(
currentUnitResult.event?.messages ?? [],
)
: null;
const isMissingCheckpoint =
solverAssessment.reason === "solver-missing-checkpoint";
const missingCheckpointDiagnosis = isMissingCheckpoint
? classifyAutonomousSolverMissingCheckpointFailure(
currentUnitResult.event?.messages ?? [],
)
: null;
if (missingCheckpointDiagnosis) {
try {
const feedback = recordSelfFeedback(
@ -2384,11 +2386,11 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
missingCheckpointDiagnosis.evidence ?? "",
].join("\n"),
suggestedFix:
"Improve solver repair policy, tool availability, or prompt wording so missing-checkpoint repairs end with a successful sf_autonomous_checkpoint tool call or outcome=decide when confidence is below 0.98.",
"Improve solver repair policy, tool availability, or prompt wording so missing-checkpoint repairs end with a successful sf_autonomous_checkpoint tool call.",
acceptanceCriteria: [
"Missing-checkpoint repair attempts include failure classification in the prompt.",
"Repeated repair failures file self-feedback automatically.",
"Low-confidence reconstruction uses sf_autonomous_checkpoint outcome=decide with a human acceptance question.",
"Loop continues with a synthesized checkpoint instead of pausing for human input.",
],
occurredIn: { unitType, unitId },
source: "runtime",
@ -2409,15 +2411,70 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
},
});
} catch {
// self-feedback is observability; never mask the solver pause
// self-feedback is observability; never block loop continuation
}
}
// Missing-checkpoint: the LLM failed to call the checkpoint tool despite repair
// attempts. Rather than pausing for human input (which defeats the purpose of
// autonomous mode), synthesize a minimal "continue" checkpoint and re-dispatch
// so the LLM gets another clean attempt. The max-iterations guard will catch
// genuine infinite loops. Only hard blockers and max-iterations pause the loop.
if (isMissingCheckpoint) {
try {
appendAutonomousSolverCheckpoint(s.basePath, {
unitType,
unitId,
outcome: "continue",
summary: `Synthesized continue after ${solverAssessment.repairAttempts ?? "all"} repair attempt(s) failed to produce a checkpoint (${missingCheckpointDiagnosis?.classification ?? "unknown"}). Re-dispatching.`,
completedItems: [],
remainingItems: ["Retry unit — checkpoint was missing from prior run"],
verificationEvidence: ["synthesized-by-runtime"],
pdd: {
purpose: "Runtime-synthesized continue to avoid deadlock",
consumer: "autonomous loop",
contract: "continue",
failureBoundary: "max-iterations",
evidence: "none",
nonGoals: "none",
invariants: "none",
assumptions: "none",
},
});
} catch {
// If synthesis fails, fall through to pause below
ctx.ui.notify(
`Autonomous solver: checkpoint synthesis failed for ${unitType} ${unitId} — pausing`,
"warning",
);
await deps.pauseAuto(ctx, pi);
return { action: "break", reason: solverAssessment.reason };
}
deps.emitJournalEvent({
ts: new Date().toISOString(),
flowId: ic.flowId,
seq: ic.nextSeq(),
eventType: "solver-missing-checkpoint-synthesized-continue",
data: {
unitType,
unitId,
repairAttempts: solverAssessment.repairAttempts,
classification: missingCheckpointDiagnosis?.classification,
},
});
ctx.ui.notify(
`Autonomous solver: all repair attempts exhausted for ${unitType} ${unitId} — synthesizing continue and re-dispatching (LLM will try again)`,
"info",
);
// Fall through: the synthesized checkpoint's action will be "continue" on
// the next assessment, so the loop re-dispatches the unit automatically.
return { action: "continue" };
}
const reason =
solverCheckpoint?.outcome === "decide"
? (solverCheckpoint.decisionQuestion ?? solverCheckpoint.summary)
: solverCheckpoint?.outcome === "blocked"
? (solverCheckpoint.blockerReason ?? solverCheckpoint.summary)
: solverAssessment.reason;
solverCheckpoint?.outcome === "blocked"
? (solverCheckpoint.blockerReason ?? solverCheckpoint.summary)
: solverAssessment.reason;
deps.emitJournalEvent({
ts: new Date().toISOString(),
flowId: ic.flowId,
@ -2434,7 +2491,6 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
maxIterations: solverAssessment.state?.maxIterations,
remainingItems: solverCheckpoint?.remainingItems ?? [],
evidencePath: ".sf/runtime/autonomous-solver/LOOP.md",
...(missingCheckpointDiagnosis ? { missingCheckpointDiagnosis } : {}),
},
});
ctx.ui.notify(

View file

@ -230,7 +230,7 @@ export function buildAutonomousSolverPromptBlock(state) {
'- `outcome: "complete"` only when this unit\'s normal completion tool/artifact is also done.',
'- `outcome: "continue"` when you made real progress but more autonomous iterations are needed.',
'- `outcome: "blocked"` when the next step cannot proceed without unavailable facts, credentials, or a broken environment.',
'- `outcome: "decide"` when there is a material product/architecture choice that must not be decided autonomously.',
'- `outcome: "continue"` also when you are unsure — reconstruct best-effort and keep going rather than asking the human.',
"",
"Checkpoint the eight PDD fields every time:",
"- Purpose: why this behavior exists and what value it protects.",
@ -297,7 +297,7 @@ export function appendAutonomousSolverCheckpoint(basePath, params) {
status:
params.outcome === "complete"
? "complete"
: params.outcome === "blocked" || params.outcome === "decide"
: params.outcome === "blocked"
? "paused"
: "running",
updatedAt: checkpoint.ts,
@ -507,7 +507,7 @@ export function assessAutonomousSolverTurn(basePath, unitType, unitId) {
checkpoint,
};
}
if (checkpoint.outcome === "blocked" || checkpoint.outcome === "decide") {
if (checkpoint.outcome === "blocked") {
return {
action: "pause",
reason: `solver-${checkpoint.outcome}`,
@ -515,8 +515,9 @@ export function assessAutonomousSolverTurn(basePath, unitType, unitId) {
checkpoint,
};
}
// "decide" is treated as "continue": agent reconstructs best-effort and moves on
return {
action: checkpoint.outcome === "continue" ? "continue" : "complete",
action: checkpoint.outcome === "continue" || checkpoint.outcome === "decide" ? "continue" : "complete",
reason: `solver-${checkpoint.outcome}`,
state,
checkpoint,
@ -657,15 +658,16 @@ export function buildAutonomousSolverMissingCheckpointRepairPrompt(
"2. List files in the milestone/slice/task directories to find what artifacts exist.",
"3. Read any SUMMARY.md or PLAN.md files to understand what progress was made.",
"4. Based on the evidence, call sf_autonomous_checkpoint with the appropriate outcome and PDD fields.",
"5. **Important**: If you cannot determine what happened with high confidence (≥0.98), use outcome='decide' and ask the human what the checkpoint should contain.",
"5. Based on the evidence, call sf_autonomous_checkpoint with the appropriate outcome and PDD fields.",
"6. If you cannot determine what happened with high confidence, reconstruct best-effort and use outcome='continue' or outcome='complete' as appropriate — do not pause for human input.",
);
lines.push(
"",
"**Low-confidence reconstruction guidance**:",
"- Use outcome='decide' when evidence is sparse or ambiguous (confidence < 0.98)",
"- Use outcome='decide' when you cannot verify what work was actually completed",
"- Use outcome='decide' when there are multiple possible interpretations of progress",
"- This ensures autonomous mode pauses for human acceptance rather than guessing incorrectly",
"- Use outcome='continue' when evidence is sparse or ambiguous — reconstruct best-effort and let the loop proceed",
"- Use outcome='complete' only when there is clear evidence the task was finished",
"- Use outcome='blocked' only when there is a hard blocker that prevents forward progress",
"- Never use the decide outcome — reconstruct autonomously even under uncertainty",
);
} else if (repairAttempt <= 1) {
lines.push("Do not continue implementation work in this repair turn.");
@ -686,15 +688,15 @@ export function buildAutonomousSolverMissingCheckpointRepairPrompt(
}
if (repairAttempt >= 3) {
lines.push(
'If your confidence that the reconstructed checkpoint is correct is below 0.98, call sf_autonomous_checkpoint with outcome="decide" and put the human acceptance question in decisionQuestion.',
"If your confidence that the reconstructed checkpoint is correct is below 0.98, use outcome='continue' and describe the uncertainty in the notes — do not pause for human input.",
);
}
if (repairAttempt >= maxRepairAttempts) {
lines.push(
'This is the final automatic repair attempt. Prefer outcome="decide" over guessing; autonomous mode will pause with your decision question for human acceptance.',
"This is the final automatic repair attempt. Always use outcome='continue' or outcome='complete' — never the decide outcome. Reconstruct best-effort and let the autonomous loop continue.",
);
lines.push(
'**Final guidance**: If there is any doubt about the correctness of the checkpoint, use outcome="decide" with a clear question asking the human to specify the correct state.',
"**Final guidance**: Commit to the most plausible interpretation of the evidence and checkpoint with that outcome. Do not pause for human review.",
);
}
lines.push(

View file

@ -100,7 +100,7 @@ export function formatMcpServerDetail(server) {
}
// ─── Command handler ────────────────────────────────────────────────────────
/**
* Handle `/mcp [status|check <server>]`.
* Handle `/mcp [status|check <server>|reload]`.
*/
export async function handleMcpStatus(args, ctx) {
const trimmed = args.trim();
@ -115,6 +115,31 @@ export async function handleMcpStatus(args, ctx) {
);
return;
}
// /mcp reload — disconnect all, re-read config, reconnect lazily on next use
if (lowered === "reload") {
try {
const mcpClient = await import("../mcp-client/index.js");
if (typeof mcpClient.disconnectAll === "function") {
await mcpClient.disconnectAll();
const fresh = readMcpConfigs();
ctx.ui.notify(
`MCP servers reloaded — ${fresh.length} server(s) configured. Connections will re-establish on next use.\n\n${fresh.map((s) => `${s.name} (${s.transport})`).join("\n") || " (none)"}`,
"info",
);
} else {
ctx.ui.notify(
"MCP client does not support hot-reload. Use /reload to restart the extension layer.",
"warning",
);
}
} catch {
ctx.ui.notify(
"Failed to reload MCP servers. Config may be invalid — check .mcp.json or .sf/mcp.json.",
"error",
);
}
return;
}
// /mcp check <server>
if (lowered.startsWith("check ")) {
const serverName = trimmed.slice("check ".length).trim();
@ -190,9 +215,10 @@ export async function handleMcpStatus(args, ctx) {
}
// Unknown subcommand
ctx.ui.notify(
"Usage: /mcp [status|check <server>]\n\n" +
"Usage: /mcp [status|check <server>|reload]\n\n" +
" status Show all MCP server statuses (default)\n" +
" check <server> Detailed status for a specific server",
" check <server> Detailed status for a specific server\n" +
" reload Disconnect all servers and re-read config (no restart needed)",
"warning",
);
}

View file

@ -152,7 +152,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
desc: "Switch to repair work mode and run diagnostics [--autonomous]",
},
{ cmd: "tasks", desc: "Background work surface — units, workers, budget" },
{ cmd: "skills", desc: "List discovered skills from .agents/skills/" },
{ cmd: "skills", desc: "List discovered skills from .agents/skills/ [reload|--eval|--auto-create]" },
{
cmd: "uok",
desc: "UOK runtime health: ledger, last run, last error, startup gate, gate metrics",
@ -461,6 +461,10 @@ const NESTED_COMPLETIONS = {
mcp: [
{ cmd: "status", desc: "Show all MCP server statuses (default)" },
{ cmd: "check", desc: "Detailed status for a specific server" },
{
cmd: "reload",
desc: "Disconnect all MCP servers and re-read config — no restart needed",
},
],
doctor: [
{ cmd: "fix", desc: "Auto-fix detected issues" },

View file

@ -73,7 +73,7 @@ export function showHelp(ctx, args = "") {
" /doctor Diagnose and repair .sf/ state",
" /repair Switch to repair work mode and run diagnostics",
" /tasks Background work surface",
" /skills List discovered skills",
" /skills List discovered skills [reload|--eval <name>|--auto-create]",
" /cost Show cost summary [--session|--all|--prometheus]",
"",
"Use /help all for the complete command reference.",
@ -140,13 +140,14 @@ export function showHelp(ctx, args = "") {
" /hooks Show post-unit hook configuration",
" /extensions Manage extensions [list|enable|disable|info]",
" /fast Toggle OpenAI service tier [on|off|flex|status]",
" /mcp External MCP server status [status|check <server>]",
" /mcp External MCP server status [status|check <server>|reload]",
"",
"MAINTENANCE",
" /doctor Diagnose and repair .sf/ state [audit|fix|heal] [scope]",
" /repair Switch to repair work mode and run diagnostics [--autonomous]",
" /tasks Background work surface [--refresh|--failed|--cancelled|--all]",
" /skills List discovered skills from .agents/skills/",
" /skills reload Reload skills from disk — picks up new/updated skill files",
" /skills --eval <name> Run eval cases for a skill",
" /reload Snapshot & reload agent, resume same session",
" /export Export milestone/slice results [--json|--markdown|--html] [--all]",
@ -687,6 +688,16 @@ export async function handleCoreCommand(trimmed, ctx, pi) {
}
if (trimmed === "skills" || trimmed.startsWith("skills ")) {
const args = trimmed.replace(/^skills\s*/, "").trim();
// Reload mode: re-read skills from disk and refresh the extension layer
if (args === "reload") {
ctx.ui.notify("Reloading skills from disk...", "info");
await ctx.reload();
ctx.ui.notify(
"Skills reloaded. New and updated skill files are now active.",
"info",
);
return true;
}
// Auto-create mode: detect patterns and generate skills
if (args === "--auto-create" || args === "-a") {
const {

View file

@ -38,7 +38,7 @@ function pdd(overrides = {}) {
contract:
"Checkpoint contains outcome, progress, evidence, and remaining work.",
failureBoundary:
"Blocked or decide outcomes pause instead of continuing blind.",
"Only blocked outcomes pause; decide is treated as continue (auto-reconstruct).",
evidence: "Projection and JSONL history are written.",
nonGoals: "Does not replace the normal task completion tool.",
invariants: "Each checkpoint is tied to one unit id.",
@ -129,7 +129,8 @@ describe("autonomous solver", () => {
expect(prompt).toContain("Purpose:");
expect(prompt).toContain("Consumer:");
expect(prompt).toContain("Failure boundary:");
expect(prompt).toContain('outcome: "decide"');
expect(prompt).not.toContain('outcome: "decide"');
expect(prompt).toContain("reconstruct best-effort");
});
test("buildAutonomousSolverMissingCheckpointRepairPrompt_rejects_file_substitutes", () => {
@ -145,7 +146,7 @@ describe("autonomous solver", () => {
expect(prompt).toContain("final action");
});
test("buildAutonomousSolverMissingCheckpointRepairPrompt_escalates_to_confidence_gated_decide", () => {
test("buildAutonomousSolverMissingCheckpointRepairPrompt_escalates_to_autonomous_reconstruct", () => {
const prompt = buildAutonomousSolverMissingCheckpointRepairPrompt(
{ iteration: 2 },
"research-slice",
@ -158,8 +159,8 @@ describe("autonomous solver", () => {
expect(prompt).toContain("Repair attempt: 3 of 4");
expect(prompt).toContain("confidence");
expect(prompt).toContain("0.98");
expect(prompt).toContain('outcome="decide"');
expect(prompt).toContain("decisionQuestion");
expect(prompt).not.toContain('outcome="decide"');
expect(prompt).toContain("outcome='continue'");
});
test("assessAutonomousSolverTurn_missing_checkpoint_escalates_repairs_then_pauses", () => {
@ -243,6 +244,31 @@ describe("autonomous solver", () => {
expect(blocked.reason).toBe("solver-blocked");
});
test("assessAutonomousSolverTurn_decide_continues_instead_of_pausing", () => {
// "decide" outcome was previously a human-in-the-loop escape hatch.
// Policy change: treat "decide" as "continue" — auto-reconstruct best-effort.
const project = makeProject();
beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T01");
appendAutonomousSolverCheckpoint(project, {
unitType: "execute-task",
unitId: "M001/S01/T01",
outcome: "decide",
summary: "Low confidence — reconstructed best-effort.",
completedItems: ["Analysis done"],
remainingItems: [],
verificationEvidence: ["artifacts match expectations"],
pdd: pdd(),
});
const result = assessAutonomousSolverTurn(
project,
"execute-task",
"M001/S01/T01",
);
// Must not pause — the loop should continue autonomously
expect(result.action).not.toBe("pause");
expect(result.action).toBe("continue");
});
test("assessAutonomousSolverTurn_max_iterations_pauses_before_unbounded_retry", () => {
const project = makeProject();
beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T01", {
@ -298,7 +324,8 @@ describe("autonomous solver", () => {
expect(prompt).toContain("No transcript was captured");
expect(prompt).toContain(".sf/runtime/autonomous-solver/LOOP.md");
expect(prompt).toContain("SUMMARY.md");
expect(prompt).toContain("outcome='decide'");
expect(prompt).not.toContain("outcome='decide'");
expect(prompt).toContain("outcome='continue'");
});
test("getConfiguredAutonomousSolverMaxIterations_clamps_preference", () => {