sf snapshot: uncommitted changes after 37m inactivity
This commit is contained in:
parent
7e8e3aa846
commit
b2bcb922de
9 changed files with 107 additions and 12 deletions
BIN
.sf/backups/db/sf.db.2026-05-10T07-56-50-335Z
Normal file
BIN
.sf/backups/db/sf.db.2026-05-10T07-56-50-335Z
Normal file
Binary file not shown.
BIN
.sf/metrics.db
BIN
.sf/metrics.db
Binary file not shown.
|
|
@ -41,6 +41,16 @@
|
|||
"lastUsed": "2026-05-10T02:32:24.115Z",
|
||||
"successRate": 1,
|
||||
"total": 3
|
||||
},
|
||||
"minimax/MiniMax-M2.7": {
|
||||
"successes": 2,
|
||||
"failures": 0,
|
||||
"timeouts": 0,
|
||||
"totalTokens": 6498562,
|
||||
"totalCost": 2.22445842,
|
||||
"lastUsed": "2026-05-10T07:26:04.505Z",
|
||||
"successRate": 1,
|
||||
"total": 2
|
||||
}
|
||||
},
|
||||
"discuss-milestone": {
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import {
|
|||
scheduleSidecarQueue,
|
||||
} from "../uok/execution-graph.js";
|
||||
import { resolveUokFlags } from "../uok/flags.js";
|
||||
import { clearRunawayRecoveredRuntimeRecords } from "../uok/unit-runtime.js";
|
||||
import { logWarning } from "../workflow-logger.js";
|
||||
import {
|
||||
COOLDOWN_FALLBACK_WAIT_MS,
|
||||
|
|
@ -422,6 +423,13 @@ async function runExitSolverEval(ctx, s, deps, iteration) {
|
|||
*/
|
||||
export async function autoLoop(ctx, pi, s, deps) {
|
||||
debugLog("autoLoop", { phase: "enter" });
|
||||
const runawayHeal = clearRunawayRecoveredRuntimeRecords(s.basePath);
|
||||
if (runawayHeal > 0) {
|
||||
debugLog("autoLoop", {
|
||||
phase: "runaway-runtime-heal",
|
||||
cleared: runawayHeal,
|
||||
});
|
||||
}
|
||||
let iteration = 0;
|
||||
// Load persisted stuck state so counters survive session restarts (#3704)
|
||||
const persisted = loadStuckState(s.basePath);
|
||||
|
|
@ -480,7 +488,10 @@ export async function autoLoop(ctx, pi, s, deps) {
|
|||
iteration,
|
||||
});
|
||||
if (s.isYolo()) {
|
||||
logWarning("dispatch", `YOLO: loop at ${iteration} iterations — continuing past safety limit`);
|
||||
logWarning(
|
||||
"dispatch",
|
||||
`YOLO: loop at ${iteration} iterations — continuing past safety limit`,
|
||||
);
|
||||
} else {
|
||||
await deps.stopAuto(
|
||||
ctx,
|
||||
|
|
@ -502,7 +513,10 @@ export async function autoLoop(ctx, pi, s, deps) {
|
|||
`Memory pressure: ${mem.heapMB}MB / ${mem.limitMB}MB (${Math.round(mem.pct * 100)}%) — stopping autonomous mode to prevent OOM kill`,
|
||||
);
|
||||
if (s.isYolo()) {
|
||||
logWarning("dispatch", "YOLO: continuing despite memory pressure — OOM risk accepted");
|
||||
logWarning(
|
||||
"dispatch",
|
||||
"YOLO: continuing despite memory pressure — OOM risk accepted",
|
||||
);
|
||||
} else {
|
||||
await deps.stopAuto(
|
||||
ctx,
|
||||
|
|
@ -694,7 +708,10 @@ export async function autoLoop(ctx, pi, s, deps) {
|
|||
});
|
||||
if (guardsResult.action === "break") {
|
||||
if (s.isYolo()) {
|
||||
logWarning("dispatch", `YOLO: bypassing guard break for ${iterData.unitId}`);
|
||||
logWarning(
|
||||
"dispatch",
|
||||
`YOLO: bypassing guard break for ${iterData.unitId}`,
|
||||
);
|
||||
} else {
|
||||
finishTurn("stopped", "manual-attention", "guard-break");
|
||||
break;
|
||||
|
|
@ -983,7 +1000,10 @@ export async function autoLoop(ctx, pi, s, deps) {
|
|||
deps.uokObserver?.onPhaseResult("guard", guardsResult.action);
|
||||
if (guardsResult.action === "break") {
|
||||
if (s.isYolo()) {
|
||||
logWarning("dispatch", `YOLO: bypassing guard break for ${iterData.unitId}`);
|
||||
logWarning(
|
||||
"dispatch",
|
||||
`YOLO: bypassing guard break for ${iterData.unitId}`,
|
||||
);
|
||||
} else {
|
||||
finishTurn("stopped", "manual-attention", "guard-break");
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -76,6 +76,7 @@ import { resolveUokFlags } from "./uok/flags.js";
|
|||
import { UokGateRunner } from "./uok/gate-runner.js";
|
||||
import { ensurePlanV2Graph as ensurePlanningFlowGraph } from "./uok/plan.js";
|
||||
import {
|
||||
clearRunawayRecoveredRuntimeRecords,
|
||||
clearUnitRuntimeRecord,
|
||||
listUnitRuntimeRecords,
|
||||
} from "./uok/unit-runtime.js";
|
||||
|
|
@ -1389,15 +1390,15 @@ async function dispatchDiscussForMilestone(
|
|||
*/
|
||||
/**
|
||||
* Self-heal: scan runtime records and clear stale ones left behind when
|
||||
* autonomous mode crashed mid-unit. auto.ts has its own selfHealRuntimeRecords()
|
||||
* but guided-flow (manual /mode) never called it — meaning stale records
|
||||
* persisted until the next /autonomous run. This ensures the workflow entry
|
||||
* starts from a clean state regardless of how the previous session ended.
|
||||
* autonomous mode crashed mid-unit. Recover `runaway-recovered` snapshots so
|
||||
* `decideUnitRuntimeDispatch` stops blocking autonomous and guided resumes.
|
||||
* Clearing at entry matches a fresh session boundary and avoids indefinite
|
||||
* `runaway-recovery-reset-required` wedges.
|
||||
*/
|
||||
function selfHealRuntimeRecords(basePath, ctx) {
|
||||
try {
|
||||
let cleared = clearRunawayRecoveredRuntimeRecords(basePath);
|
||||
const records = listUnitRuntimeRecords(basePath);
|
||||
let cleared = 0;
|
||||
for (const record of records) {
|
||||
const { unitType, unitId, phase } = record;
|
||||
// Clear records whose expected artifact already exists (completed but not cleaned up)
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import { tmpdir } from "node:os";
|
|||
import { join } from "node:path";
|
||||
import { afterEach, test } from "vitest";
|
||||
import {
|
||||
clearRunawayRecoveredRuntimeRecords,
|
||||
clearUnitRuntimeRecord,
|
||||
decideUnitRuntimeDispatch,
|
||||
getUnitRuntimeState,
|
||||
|
|
@ -244,6 +245,30 @@ test("readUnitRuntimeRecord_returns_null_for_missing", () => {
|
|||
assert.equal(record, null);
|
||||
});
|
||||
|
||||
test("clearRunawayRecoveredRuntimeRecords_clears_only_runaway_terminal_rows", () => {
  // Seed two runtime records: one in the `runaway-recovered` status and one
  // plain `failed` record that the sweep is expected to leave alone.
  const projectRoot = makeProject();
  const startedAt = Date.now();
  const runawayFields = {
    status: "runaway-recovered",
    phase: "paused",
    runawayGuardPause: { reason: "test pause" },
  };
  const failedFields = {
    status: "failed",
    phase: "timeout",
  };
  writeUnitRuntimeRecord(
    projectRoot,
    "discuss-milestone",
    "M001-X",
    startedAt,
    runawayFields,
  );
  writeUnitRuntimeRecord(
    projectRoot,
    "execute-task",
    "M001/S01/T01",
    startedAt,
    failedFields,
  );
  assert.equal(listUnitRuntimeRecords(projectRoot).length, 2);

  // Exactly one record (the runaway-recovered one) is reported as cleared.
  const clearedCount = clearRunawayRecoveredRuntimeRecords(projectRoot);
  assert.equal(clearedCount, 1);

  // Only the failed execute-task record is still listed afterwards.
  const remaining = listUnitRuntimeRecords(projectRoot);
  assert.equal(remaining.length, 1);
  const survivor = remaining[0];
  assert.equal(survivor.unitType, "execute-task");
});
|
||||
|
||||
test("clearRunawayRecoveredRuntimeRecords_returns_zero_when_dir_missing", () => {
  // No runtime records are written for this project, so nothing can be cleared.
  const projectRoot = makeProject();
  const clearedCount = clearRunawayRecoveredRuntimeRecords(projectRoot);
  assert.equal(clearedCount, 0);
});
|
||||
|
||||
test("clearUnitRuntimeRecord_removes_file_and_cache", () => {
|
||||
const root = makeProject();
|
||||
writeUnitRuntimeRecord(root, "execute-task", "M001/S01/T01", Date.now(), {
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
* continue to work without changes.
|
||||
*/
|
||||
export {
|
||||
clearRunawayRecoveredRuntimeRecords,
|
||||
clearUnitRuntimeRecord,
|
||||
decideUnitRuntimeDispatch,
|
||||
formatExecuteTaskRecoveryStatus,
|
||||
|
|
|
|||
|
|
@ -198,6 +198,7 @@ export {
|
|||
} from "./unit-lineage.js";
|
||||
// ─── Unit Runtime ──────────────────────────────────────────────────────────
|
||||
export {
|
||||
clearRunawayRecoveredRuntimeRecords,
|
||||
clearUnitRuntimeRecord,
|
||||
decideUnitRuntimeDispatch,
|
||||
formatExecuteTaskRecoveryStatus,
|
||||
|
|
|
|||
|
|
@ -449,6 +449,38 @@ export function clearUnitRuntimeRecord(basePath, unitType, unitId) {
|
|||
_runtimeCache.delete(path);
|
||||
if (existsSync(path)) unlinkSync(path);
|
||||
}
|
||||
/**
 * Clears every unit-runtime record whose runtime state reports the
 * `runaway-recovered` status, so that dispatch is able to resume.
 *
 * Purpose: after a runaway watchdog recovery the terminal records stay behind,
 * and `decideUnitRuntimeDispatch` deliberately keeps blocking on them
 * (`runaway-recovery-reset-required`) until an operator/session boundary
 * removes them. A fresh launch is treated as that boundary, which keeps
 * workflows from staying wedged indefinitely.
 *
 * Consumer: the autoLoop bootstrap and guided-flow `selfHealRuntimeRecords`.
 *
 * Records whose `unitType` or `unitId` is not a non-empty string are skipped,
 * because they cannot be addressed through `clearUnitRuntimeRecord`.
 *
 * @param basePath Project base path; handed through to `listUnitRuntimeRecords`
 *   and `clearUnitRuntimeRecord`.
 * @returns Number of runtime records that were cleared.
 */
export function clearRunawayRecoveredRuntimeRecords(basePath) {
  const isNonEmptyString = (value) =>
    typeof value === "string" && value.length > 0;

  let removedCount = 0;
  for (const runtimeRecord of listUnitRuntimeRecords(basePath)) {
    const { unitType, unitId } = runtimeRecord;
    const hasUsableIdentity =
      isNonEmptyString(unitType) && isNonEmptyString(unitId);
    // The state lookup only happens for addressable records (short-circuit),
    // and each clear runs inside the same pass that inspected the record.
    if (
      hasUsableIdentity &&
      getUnitRuntimeState(runtimeRecord).status === "runaway-recovered"
    ) {
      clearUnitRuntimeRecord(basePath, unitType, unitId);
      removedCount += 1;
    }
  }
  return removedCount;
}
|
||||
/**
|
||||
* Return all runtime records currently on disk for `basePath`.
|
||||
* Returns an empty array if the runtime directory does not exist.
|
||||
|
|
@ -489,11 +521,16 @@ export async function inspectExecuteTaskDurability(basePath, unitId) {
|
|||
if (isDbAvailable()) {
|
||||
const taskRow = getTask(mid, sid, tid);
|
||||
const taskStatus = taskRow?.status ?? "pending";
|
||||
nextActionAdvanced = taskStatus !== "pending" && taskStatus !== "in_progress";
|
||||
nextActionAdvanced =
|
||||
taskStatus !== "pending" && taskStatus !== "in_progress";
|
||||
} else {
|
||||
const stateAbs = join(sfRoot(basePath), "STATE.md");
|
||||
const stateContent = existsSync(stateAbs) ? readFileSync(stateAbs, "utf-8") : "";
|
||||
nextActionAdvanced = !new RegExp(`Execute ${escapedTid}\\b`).test(stateContent);
|
||||
const stateContent = existsSync(stateAbs)
|
||||
? readFileSync(stateAbs, "utf-8")
|
||||
: "";
|
||||
nextActionAdvanced = !new RegExp(`Execute ${escapedTid}\\b`).test(
|
||||
stateContent,
|
||||
);
|
||||
}
|
||||
// Must-have coverage: load task plan and count mentions in summary
|
||||
let mustHaveCount = 0;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue