sf snapshot: uncommitted changes after 36m inactivity

This commit is contained in:
Mikael Hugo 2026-05-10 04:27:43 +02:00
parent 1a0222fc71
commit 01d58c570d
14 changed files with 162 additions and 79 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -23,14 +23,26 @@
"total": 1
},
"minimax/MiniMax-M2.7-highspeed": {
"successes": 1,
"successes": 2,
"failures": 0,
"timeouts": 0,
"totalTokens": 0,
"totalCost": 0,
"lastUsed": "2026-05-10T00:50:07.124Z",
"totalTokens": 891034,
"totalCost": 0.20030757,
"lastUsed": "2026-05-10T01:24:00.207Z",
"successRate": 1,
"total": 1
"total": 2
}
},
"discuss-milestone": {
"minimax/MiniMax-M2.7-highspeed": {
"successes": 2,
"failures": 0,
"timeouts": 0,
"totalTokens": 8639600,
"totalCost": 2.0647307100000005,
"lastUsed": "2026-05-10T01:43:48.671Z",
"successRate": 1,
"total": 2
}
}
}

View file

@ -167,8 +167,8 @@ function formatSolverWidgetLine(basePath, theme, width, pad) {
.join(" · ");
return truncateToWidth(`${pad}${theme.fg("dim", text)}`, width, "…");
}
function formatUokDiagnosticWidgetLine(basePath, theme, width, pad) {
const diagnostics = readUokDiagnostics(basePath);
function formatUokDiagnosticWidgetLine(basePath, theme, width, pad, cachedDiagnostics) {
const diagnostics = cachedDiagnostics !== undefined ? cachedDiagnostics : readUokDiagnostics(basePath);
if (!diagnostics) return null;
const parts = [
`uok ${diagnostics.verdict ?? "unknown"}`,
@ -607,6 +607,11 @@ export function updateProgressWidget(
let cachedLines;
let cachedWidth;
let cachedRtkLabel;
// Cache health score and UOK diagnostics at 15s interval — recomputing
// them on every 1s spinner tick causes the widget height to change whenever
// the score level transitions, making the banner "bounce" on screen.
let cachedProgressScore = computeProgressScore();
let cachedUokDiagnostics = readUokDiagnostics(accessors.getBasePath());
let activityFrame = 0;
const refreshRtkLabel = () => {
try {
@ -634,6 +639,9 @@ export function updateProgressWidget(
updateSliceProgressCache(accessors.getBasePath(), mid.id, slice?.id);
}
refreshRtkLabel();
// Refresh health score and diagnostics alongside other slow data
cachedProgressScore = computeProgressScore();
cachedUokDiagnostics = readUokDiagnostics(accessors.getBasePath());
cachedLines = undefined;
} catch (err) {
/* non-fatal */
@ -667,8 +675,9 @@ export function updateProgressWidget(
const spinner = theme.fg("accent", ACTIVITY_FRAMES[activityFrame]);
const elapsed = formatAutoElapsed(accessors.getAutoStartTime());
const modeTag = accessors.isStepMode() ? "NEXT" : "AUTO";
// Health indicator in header
const score = computeProgressScore();
// Health indicator in header — use 15s-cached score (not live)
// to keep widget height stable between refreshes.
const score = cachedProgressScore;
const healthColor =
score.level === "green"
? "success"
@ -782,6 +791,7 @@ export function updateProgressWidget(
theme,
width,
pad,
cachedUokDiagnostics,
);
if (diagnosticLine) lines.push(diagnosticLine);
// Progress bar
@ -883,6 +893,7 @@ export function updateProgressWidget(
theme,
width,
pad,
cachedUokDiagnostics,
);
if (diagnosticLine) lines.push(diagnosticLine);
lines.push("");

View file

@ -8,6 +8,7 @@ import {
hasResearchTerminalTransition,
isAutoActive,
isAutoPaused,
isCanAskUser,
markResearchTerminalTransition,
markToolEnd,
markToolStart,
@ -730,6 +731,10 @@ export function registerHooks(pi, ecosystemHandlers = []) {
// ── Discussion gate enforcement: track pending gate questions ─────────
// Only gate-shaped ask_user_questions calls should block execution.
// The gate stays pending until the user selects the approval option.
// In autonomous mode (canAskUser=false): block the gate question and
// instruct the agent to investigate and resolve its own uncertainty
// rather than waiting for human input. The question exists because
// confidence is low — the agent must research until it can decide.
if (event.toolName === "ask_user_questions") {
const questions = event.input?.questions ?? [];
const questionId = questions.find(
@ -737,28 +742,61 @@ export function registerHooks(pi, ecosystemHandlers = []) {
typeof question?.id === "string" && isGateQuestionId(question.id),
)?.id;
if (typeof questionId === "string") {
if (isAutoActive() && !isCanAskUser()) {
// Do not set a pending gate — the agent must self-resolve.
// The gate was triggered because confidence < 0.98. The agent
// must investigate until confidence reaches 0.98, then proceed.
const questionSummary =
questions.find((q) => q.id === questionId)?.question ?? questionId;
return {
block: true,
reason:
`AUTONOMOUS MODE — gate question blocked. You called ask_user_questions ` +
`because your confidence was below 0.98. Target: reach ≥ 0.98 confidence ` +
`through investigation, then proceed without pausing.\n\n` +
`Gate question (self-resolve this): "${questionSummary}"\n\n` +
`Confidence-building protocol (execute in order, stop when confidence ≥ 0.98):\n` +
`1. Tier 1 — Read artifacts: task plans, slice plans, .sf/DECISIONS.md, ` +
`.sf/REQUIREMENTS.md, .sf/CONTEXT.md, docs/records/, existing test coverage.\n` +
`2. Tier 1 — Run verification: lint, type-check, relevant tests. Read the output.\n` +
`3. Tier 2 — External lookup: WebSearch / WebFetch / Context7 for any factual ` +
`unknowns (API behavior, version compatibility, library defaults).\n` +
`4. Self-assess: rate your confidence 0–1. If ≥ 0.98 → proceed. ` +
`If still < 0.98 → there is a real, concrete blocker.\n` +
`5. Concrete blocker only: call sf_autonomous_checkpoint with outcome="blocked" ` +
`and a precise blockerReason naming the specific unresolvable gap ` +
`(missing credentials, impossible contract, safety violation, legal scope).\n\n` +
`Do NOT call ask_user_questions again. Do NOT pause for user input. ` +
`Factual gaps are YOUR job to close via Tier 1 and Tier 2.`,
};
}
setPendingGate(questionId);
}
}
// ── Discussion gate enforcement: block tool calls while gate is pending ──
// If ask_user_questions was called with a gate ID but hasn't been confirmed,
// block all non-read-only tool calls to prevent the model from skipping gates.
// In autonomous mode: auto-clear any stale pending gate so it never blocks.
if (getPendingGate()) {
const milestoneId = getDiscussionMilestoneId(discussionBasePath);
if (isToolCallEventType("bash", event)) {
const bashGuard = shouldBlockPendingGateBash(
event.input.command,
milestoneId,
isQueuePhaseActive(),
);
if (bashGuard.block) return bashGuard;
if (isAutoActive() && !isCanAskUser()) {
clearPendingGate();
} else {
const gateGuard = shouldBlockPendingGate(
event.toolName,
milestoneId,
isQueuePhaseActive(),
);
if (gateGuard.block) return gateGuard;
const milestoneId = getDiscussionMilestoneId(discussionBasePath);
if (isToolCallEventType("bash", event)) {
const bashGuard = shouldBlockPendingGateBash(
event.input.command,
milestoneId,
isQueuePhaseActive(),
);
if (bashGuard.block) return bashGuard;
} else {
const gateGuard = shouldBlockPendingGate(
event.toolName,
milestoneId,
isQueuePhaseActive(),
);
if (gateGuard.block) return gateGuard;
}
}
}
// ── Queue-mode execution guard (#2545): block source-code mutations ──

View file

@ -142,7 +142,7 @@ function warnDeprecatedAgentInstructions() {
export function buildEscalationPolicyBlock(canAskUser) {
const tier3 = canAskUser
? "Ask the user via `ask_user_questions` — but ONLY when (1) and (2) cannot resolve."
: "DO NOT call `ask_user_questions`. If (1) and (2) cannot resolve, exit with a structured blocker message naming the unresolved ambiguity.";
: "DO NOT call `ask_user_questions`. Target confidence ≥ 0.98 via Tier 1 and Tier 2 before acting. If confidence is still < 0.98 after exhausting both tiers, there is a real concrete blocker — exit with a structured blocker message naming the specific unresolvable gap (missing credentials, impossible contract, safety violation, legal scope). Factual questions (versions, API behavior, library defaults) are always Tier 1/2.";
return `\n\n[INTERACTION POLICY — escalation tiers]
Before producing any answer to a question or filling in any unknown, escalate
through these tiers IN ORDER. Skip a tier only when it has been demonstrably

View file

@ -195,7 +195,8 @@ class Gauge {
}
set(labels = {}, value) {
this.values.set(this._key(labels), value);
const safe = Number.isFinite(value) ? value : 0;
this.values.set(this._key(labels), safe);
}
get(labels = {}) {
@ -458,6 +459,9 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
const db = _metricsDb;
if (!db) return;
const ts = new Date().toISOString();
function safeNum(n) {
return Number.isFinite(n) ? n : 0;
}
try {
const insert = db.prepare(
"INSERT INTO metrics (name, type, labels, value, timestamp, session_id) VALUES (?, ?, ?, ?, ?, ?)",
@ -469,7 +473,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
c.name,
"counter",
JSON.stringify(labels),
value ?? 0,
safeNum(value),
ts,
sessionId,
);
@ -482,7 +486,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
g.name,
"gauge",
JSON.stringify(labels),
value ?? 0,
safeNum(value),
ts,
sessionId,
);
@ -493,7 +497,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
h.name,
"histogram",
JSON.stringify({ count: h.count, sum: h.sum }),
h.sum ?? 0,
safeNum(h.sum),
ts,
sessionId,
);

View file

@ -24,7 +24,7 @@ import {
} from "../autonomous-solver.js";
import { triageTodoDump } from "../commands-todo.js";
import { emitJournalEvent, queryJournal } from "../journal.js";
import { readJudgmentLog } from "../judgment-log.js";
import { appendJudgment, readJudgmentLog } from "../judgment-log.js";
import { ModelLearner } from "../model-learner.js";
import { createScheduleStore } from "../schedule/schedule-store.js";
import { closeDatabase, getDatabase } from "../sf-db.js";
@ -216,20 +216,14 @@ describe("SF JSONL schema versioning", () => {
assert.equal(legacy.runId, "legacy-run");
});
test("judgment_log_reads_legacy_jsonl_rows_as_version_1", () => {
test("judgment_log_reads_entries_written_via_appendJudgment", () => {
const project = makeProject();
const path = join(project, ".sf", "judgment-log.jsonl");
writeFileSync(
path,
`${JSON.stringify({
ts: "2026-05-07T00:00:00.000Z",
unitId: "M001/S01/T02",
confidence: "low",
decision: "legacy row",
})}\n`,
"utf-8",
);
appendJudgment(project, {
unitId: "M001/S01/T02",
confidence: "low",
decision: "legacy row",
});
const [entry] = readJudgmentLog(project, "M001");
assert.equal(entry.schemaVersion, 1);

View file

@ -153,40 +153,32 @@ describe("metrics-central", () => {
expect(dashboard.resources.activeSessions).toBe(1);
expect(dashboard.resources.activeAgents).toBe(2);
expect(dashboard.resources.concurrentToolCalls).toBe(3);
expect(getMetricsSystemStats().databaseStatus).toBe("disconnected");
expect(getMetricsSystemStats().databaseStatus).toBe("connected");
});
it("stopMetricsCentral_persists_metrics_to_db_adapter", () => {
const rows = [];
const db = {
exec() {},
prepare(sql) {
if (sql.startsWith("INSERT")) {
return {
run(name, type, labels, value, timestamp, sessionId) {
rows.push({ name, type, labels, value, timestamp, sessionId });
},
};
}
throw new Error(`unexpected SQL: ${sql}`);
},
};
initMetricsCentral("/tmp/test-project", {
dbAdapter: db,
sessionId: "sess-db",
});
recordCounter("sf_test_db_counter", { label: "a=b,c" }, 2);
it("stopMetricsCentral_persists_metrics_to_db", async () => {
const { DatabaseSync } = await import("node:sqlite");
stopMetricsCentral();
// The beforeEach already called initMetricsCentral("/tmp/test-project"),
// so we record in the already-open metrics.db and verify after stop.
recordCounter("sf_test_db_counter", { label: "a=b,c" }, 2);
stopMetricsCentral(); // flush + close; afterEach stopMetricsCentral is a no-op
const dbPath = "/tmp/test-project/.sf/metrics.db";
const db = new DatabaseSync(dbPath, { open: true });
const rows = db
.prepare(
"SELECT name, type, value FROM metrics WHERE name = 'sf_test_db_counter' ORDER BY id DESC LIMIT 10",
)
.all();
db.close();
expect(rows).toEqual(
expect.arrayContaining([
expect.objectContaining({
name: "sf_test_db_counter",
type: "counter",
labels: JSON.stringify({ label: "a=b,c", session_id: "sess-db" }),
value: 2,
sessionId: "sess-db",
}),
]),
);

View file

@ -3,6 +3,8 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, test } from "vitest";
// Import preferences.js so that _initPrefsLoader is called and the circular dep lazy-loader is wired up.
import "../preferences.js";
import { resolveModelWithFallbacksForUnit } from "../preferences-models.js";
import { getConfiguredEnvApiKey } from "../provider-env-auth.js";

View file

@ -223,7 +223,7 @@ test("openDatabase_migrates_v27_tasks_without_created_at_through_spec_backfill",
const version = db
.prepare("SELECT MAX(version) AS version FROM schema_version")
.get();
assert.equal(version.version, 54);
assert.equal(version.version, 57);
const taskSpec = db
.prepare(
"SELECT milestone_id, slice_id, task_id, verify FROM task_specs WHERE task_id = 'T01'",

View file

@ -43,18 +43,48 @@ export function createTurnObserver(options) {
*/
function nextSequenceMetadata(category, operation, metadata) {
if (!writerToken) return metadata ?? {};
const record = nextWriteRecord({
basePath: options.basePath,
token: writerToken,
category,
operation,
metadata,
});
return {
...(metadata ?? {}),
writeSequence: record.sequence.sequence,
writerTokenId: record.writerToken.tokenId,
};
try {
const record = nextWriteRecord({
basePath: options.basePath,
token: writerToken,
category,
operation,
metadata,
});
return {
...(metadata ?? {}),
writeSequence: record.sequence.sequence,
writerTokenId: record.writerToken.tokenId,
};
} catch (err) {
// Token expired (TTL) or lost after process resume — re-acquire and retry once.
if (err?.message?.includes("Writer token is not active")) {
try {
writerToken = acquireWriterToken({
basePath: options.basePath,
traceId: current?.traceId,
turnId: current?.turnId,
});
const record = nextWriteRecord({
basePath: options.basePath,
token: writerToken,
category,
operation,
metadata,
});
return {
...(metadata ?? {}),
writeSequence: record.sequence.sequence,
writerTokenId: record.writerToken.tokenId,
tokenRenewed: true,
};
} catch {
// Re-acquisition failed — continue without sequence metadata rather than crashing.
return metadata ?? {};
}
}
return metadata ?? {};
}
}
return {
onTurnStart(contract) {

View file

@ -5,7 +5,7 @@ import { atomicWriteSync } from "../atomic-write.js";
import { sfRoot } from "../paths.js";
const activeTokens = new Map();
const TOKEN_TTL_MS = 5 * 60 * 1000; // 5 minutes
const TOKEN_TTL_MS = 2 * 60 * 60 * 1000; // 2 hours — autonomous turns can run 20-30+ minutes
function tokenKey(basePath, turnId) {
return `${basePath}:${turnId}`;
}