sf snapshot: uncommitted changes after 36m inactivity
This commit is contained in:
parent
1a0222fc71
commit
01d58c570d
14 changed files with 162 additions and 79 deletions
BIN
.sf/backups/db/sf.db.2026-05-10T02-01-37-759Z
Normal file
BIN
.sf/backups/db/sf.db.2026-05-10T02-01-37-759Z
Normal file
Binary file not shown.
BIN
.sf/backups/db/sf.db.2026-05-10T02-27-22-542Z
Normal file
BIN
.sf/backups/db/sf.db.2026-05-10T02-27-22-542Z
Normal file
Binary file not shown.
BIN
.sf/metrics.db
BIN
.sf/metrics.db
Binary file not shown.
|
|
@ -23,14 +23,26 @@
|
|||
"total": 1
|
||||
},
|
||||
"minimax/MiniMax-M2.7-highspeed": {
|
||||
"successes": 1,
|
||||
"successes": 2,
|
||||
"failures": 0,
|
||||
"timeouts": 0,
|
||||
"totalTokens": 0,
|
||||
"totalCost": 0,
|
||||
"lastUsed": "2026-05-10T00:50:07.124Z",
|
||||
"totalTokens": 891034,
|
||||
"totalCost": 0.20030757,
|
||||
"lastUsed": "2026-05-10T01:24:00.207Z",
|
||||
"successRate": 1,
|
||||
"total": 1
|
||||
"total": 2
|
||||
}
|
||||
},
|
||||
"discuss-milestone": {
|
||||
"minimax/MiniMax-M2.7-highspeed": {
|
||||
"successes": 2,
|
||||
"failures": 0,
|
||||
"timeouts": 0,
|
||||
"totalTokens": 8639600,
|
||||
"totalCost": 2.0647307100000005,
|
||||
"lastUsed": "2026-05-10T01:43:48.671Z",
|
||||
"successRate": 1,
|
||||
"total": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -167,8 +167,8 @@ function formatSolverWidgetLine(basePath, theme, width, pad) {
|
|||
.join(" · ");
|
||||
return truncateToWidth(`${pad}${theme.fg("dim", text)}`, width, "…");
|
||||
}
|
||||
function formatUokDiagnosticWidgetLine(basePath, theme, width, pad) {
|
||||
const diagnostics = readUokDiagnostics(basePath);
|
||||
function formatUokDiagnosticWidgetLine(basePath, theme, width, pad, cachedDiagnostics) {
|
||||
const diagnostics = cachedDiagnostics !== undefined ? cachedDiagnostics : readUokDiagnostics(basePath);
|
||||
if (!diagnostics) return null;
|
||||
const parts = [
|
||||
`uok ${diagnostics.verdict ?? "unknown"}`,
|
||||
|
|
@ -607,6 +607,11 @@ export function updateProgressWidget(
|
|||
let cachedLines;
|
||||
let cachedWidth;
|
||||
let cachedRtkLabel;
|
||||
// Cache health score and UOK diagnostics at 15s interval — recomputing
|
||||
// them on every 1s spinner tick causes the widget height to change whenever
|
||||
// the score level transitions, making the banner "bounce" on screen.
|
||||
let cachedProgressScore = computeProgressScore();
|
||||
let cachedUokDiagnostics = readUokDiagnostics(accessors.getBasePath());
|
||||
let activityFrame = 0;
|
||||
const refreshRtkLabel = () => {
|
||||
try {
|
||||
|
|
@ -634,6 +639,9 @@ export function updateProgressWidget(
|
|||
updateSliceProgressCache(accessors.getBasePath(), mid.id, slice?.id);
|
||||
}
|
||||
refreshRtkLabel();
|
||||
// Refresh health score and diagnostics alongside other slow data
|
||||
cachedProgressScore = computeProgressScore();
|
||||
cachedUokDiagnostics = readUokDiagnostics(accessors.getBasePath());
|
||||
cachedLines = undefined;
|
||||
} catch (err) {
|
||||
/* non-fatal */
|
||||
|
|
@ -667,8 +675,9 @@ export function updateProgressWidget(
|
|||
const spinner = theme.fg("accent", ACTIVITY_FRAMES[activityFrame]);
|
||||
const elapsed = formatAutoElapsed(accessors.getAutoStartTime());
|
||||
const modeTag = accessors.isStepMode() ? "NEXT" : "AUTO";
|
||||
// Health indicator in header
|
||||
const score = computeProgressScore();
|
||||
// Health indicator in header — use 15s-cached score (not live)
|
||||
// to keep widget height stable between refreshes.
|
||||
const score = cachedProgressScore;
|
||||
const healthColor =
|
||||
score.level === "green"
|
||||
? "success"
|
||||
|
|
@ -782,6 +791,7 @@ export function updateProgressWidget(
|
|||
theme,
|
||||
width,
|
||||
pad,
|
||||
cachedUokDiagnostics,
|
||||
);
|
||||
if (diagnosticLine) lines.push(diagnosticLine);
|
||||
// Progress bar
|
||||
|
|
@ -883,6 +893,7 @@ export function updateProgressWidget(
|
|||
theme,
|
||||
width,
|
||||
pad,
|
||||
cachedUokDiagnostics,
|
||||
);
|
||||
if (diagnosticLine) lines.push(diagnosticLine);
|
||||
lines.push("");
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import {
|
|||
hasResearchTerminalTransition,
|
||||
isAutoActive,
|
||||
isAutoPaused,
|
||||
isCanAskUser,
|
||||
markResearchTerminalTransition,
|
||||
markToolEnd,
|
||||
markToolStart,
|
||||
|
|
@ -730,6 +731,10 @@ export function registerHooks(pi, ecosystemHandlers = []) {
|
|||
// ── Discussion gate enforcement: track pending gate questions ─────────
|
||||
// Only gate-shaped ask_user_questions calls should block execution.
|
||||
// The gate stays pending until the user selects the approval option.
|
||||
// In autonomous mode (canAskUser=false): block the gate question and
|
||||
// instruct the agent to investigate and resolve its own uncertainty
|
||||
// rather than waiting for human input. The question exists because
|
||||
// confidence is low — the agent must research until it can decide.
|
||||
if (event.toolName === "ask_user_questions") {
|
||||
const questions = event.input?.questions ?? [];
|
||||
const questionId = questions.find(
|
||||
|
|
@ -737,28 +742,61 @@ export function registerHooks(pi, ecosystemHandlers = []) {
|
|||
typeof question?.id === "string" && isGateQuestionId(question.id),
|
||||
)?.id;
|
||||
if (typeof questionId === "string") {
|
||||
if (isAutoActive() && !isCanAskUser()) {
|
||||
// Do not set a pending gate — the agent must self-resolve.
|
||||
// The gate was triggered because confidence < 0.98. The agent
|
||||
// must investigate until confidence reaches 0.98, then proceed.
|
||||
const questionSummary =
|
||||
questions.find((q) => q.id === questionId)?.question ?? questionId;
|
||||
return {
|
||||
block: true,
|
||||
reason:
|
||||
`AUTONOMOUS MODE — gate question blocked. You called ask_user_questions ` +
|
||||
`because your confidence was below 0.98. Target: reach ≥ 0.98 confidence ` +
|
||||
`through investigation, then proceed without pausing.\n\n` +
|
||||
`Gate question (self-resolve this): "${questionSummary}"\n\n` +
|
||||
`Confidence-building protocol (execute in order, stop when confidence ≥ 0.98):\n` +
|
||||
`1. Tier 1 — Read artifacts: task plans, slice plans, .sf/DECISIONS.md, ` +
|
||||
`.sf/REQUIREMENTS.md, .sf/CONTEXT.md, docs/records/, existing test coverage.\n` +
|
||||
`2. Tier 1 — Run verification: lint, type-check, relevant tests. Read the output.\n` +
|
||||
`3. Tier 2 — External lookup: WebSearch / WebFetch / Context7 for any factual ` +
|
||||
`unknowns (API behavior, version compatibility, library defaults).\n` +
|
||||
`4. Self-assess: rate your confidence 0–1. If ≥ 0.98 → proceed. ` +
|
||||
`If still < 0.98 → there is a real, concrete blocker.\n` +
|
||||
`5. Concrete blocker only: call sf_autonomous_checkpoint with outcome="blocked" ` +
|
||||
`and a precise blockerReason naming the specific unresolvable gap ` +
|
||||
`(missing credentials, impossible contract, safety violation, legal scope).\n\n` +
|
||||
`Do NOT call ask_user_questions again. Do NOT pause for user input. ` +
|
||||
`Factual gaps are YOUR job to close via Tier 1 and Tier 2.`,
|
||||
};
|
||||
}
|
||||
setPendingGate(questionId);
|
||||
}
|
||||
}
|
||||
// ── Discussion gate enforcement: block tool calls while gate is pending ──
|
||||
// If ask_user_questions was called with a gate ID but hasn't been confirmed,
|
||||
// block all non-read-only tool calls to prevent the model from skipping gates.
|
||||
// In autonomous mode: auto-clear any stale pending gate so it never blocks.
|
||||
if (getPendingGate()) {
|
||||
const milestoneId = getDiscussionMilestoneId(discussionBasePath);
|
||||
if (isToolCallEventType("bash", event)) {
|
||||
const bashGuard = shouldBlockPendingGateBash(
|
||||
event.input.command,
|
||||
milestoneId,
|
||||
isQueuePhaseActive(),
|
||||
);
|
||||
if (bashGuard.block) return bashGuard;
|
||||
if (isAutoActive() && !isCanAskUser()) {
|
||||
clearPendingGate();
|
||||
} else {
|
||||
const gateGuard = shouldBlockPendingGate(
|
||||
event.toolName,
|
||||
milestoneId,
|
||||
isQueuePhaseActive(),
|
||||
);
|
||||
if (gateGuard.block) return gateGuard;
|
||||
const milestoneId = getDiscussionMilestoneId(discussionBasePath);
|
||||
if (isToolCallEventType("bash", event)) {
|
||||
const bashGuard = shouldBlockPendingGateBash(
|
||||
event.input.command,
|
||||
milestoneId,
|
||||
isQueuePhaseActive(),
|
||||
);
|
||||
if (bashGuard.block) return bashGuard;
|
||||
} else {
|
||||
const gateGuard = shouldBlockPendingGate(
|
||||
event.toolName,
|
||||
milestoneId,
|
||||
isQueuePhaseActive(),
|
||||
);
|
||||
if (gateGuard.block) return gateGuard;
|
||||
}
|
||||
}
|
||||
}
|
||||
// ── Queue-mode execution guard (#2545): block source-code mutations ──
|
||||
|
|
|
|||
|
|
@ -142,7 +142,7 @@ function warnDeprecatedAgentInstructions() {
|
|||
export function buildEscalationPolicyBlock(canAskUser) {
|
||||
const tier3 = canAskUser
|
||||
? "Ask the user via `ask_user_questions` — but ONLY when (1) and (2) cannot resolve."
|
||||
: "DO NOT call `ask_user_questions`. If (1) and (2) cannot resolve, exit with a structured blocker message naming the unresolved ambiguity.";
|
||||
: "DO NOT call `ask_user_questions`. Target confidence ≥ 0.98 via Tier 1 and Tier 2 before acting. If confidence is still < 0.98 after exhausting both tiers, there is a real concrete blocker — exit with a structured blocker message naming the specific unresolvable gap (missing credentials, impossible contract, safety violation, legal scope). Factual questions (versions, API behavior, library defaults) are always Tier 1/2.";
|
||||
return `\n\n[INTERACTION POLICY — escalation tiers]
|
||||
Before producing any answer to a question or filling in any unknown, escalate
|
||||
through these tiers IN ORDER. Skip a tier only when it has been demonstrably
|
||||
|
|
|
|||
|
|
@ -195,7 +195,8 @@ class Gauge {
|
|||
}
|
||||
|
||||
set(labels = {}, value) {
|
||||
this.values.set(this._key(labels), value);
|
||||
const safe = Number.isFinite(value) ? value : 0;
|
||||
this.values.set(this._key(labels), safe);
|
||||
}
|
||||
|
||||
get(labels = {}) {
|
||||
|
|
@ -458,6 +459,9 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
|
|||
const db = _metricsDb;
|
||||
if (!db) return;
|
||||
const ts = new Date().toISOString();
|
||||
/**
 * Normalize a metric value before it is written out.
 *
 * @param {*} n - Candidate value.
 * @returns {number} `n` itself when it is a finite number, otherwise 0.
 */
function safeNum(n) {
  // Number.isFinite does not coerce, so strings, null and undefined all
  // fall through to the 0 fallback along with NaN and ±Infinity.
  if (Number.isFinite(n)) {
    return n;
  }
  return 0;
}
|
||||
try {
|
||||
const insert = db.prepare(
|
||||
"INSERT INTO metrics (name, type, labels, value, timestamp, session_id) VALUES (?, ?, ?, ?, ?, ?)",
|
||||
|
|
@ -469,7 +473,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
|
|||
c.name,
|
||||
"counter",
|
||||
JSON.stringify(labels),
|
||||
value ?? 0,
|
||||
safeNum(value),
|
||||
ts,
|
||||
sessionId,
|
||||
);
|
||||
|
|
@ -482,7 +486,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
|
|||
g.name,
|
||||
"gauge",
|
||||
JSON.stringify(labels),
|
||||
value ?? 0,
|
||||
safeNum(value),
|
||||
ts,
|
||||
sessionId,
|
||||
);
|
||||
|
|
@ -493,7 +497,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) {
|
|||
h.name,
|
||||
"histogram",
|
||||
JSON.stringify({ count: h.count, sum: h.sum }),
|
||||
h.sum ?? 0,
|
||||
safeNum(h.sum),
|
||||
ts,
|
||||
sessionId,
|
||||
);
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ import {
|
|||
} from "../autonomous-solver.js";
|
||||
import { triageTodoDump } from "../commands-todo.js";
|
||||
import { emitJournalEvent, queryJournal } from "../journal.js";
|
||||
import { readJudgmentLog } from "../judgment-log.js";
|
||||
import { appendJudgment, readJudgmentLog } from "../judgment-log.js";
|
||||
import { ModelLearner } from "../model-learner.js";
|
||||
import { createScheduleStore } from "../schedule/schedule-store.js";
|
||||
import { closeDatabase, getDatabase } from "../sf-db.js";
|
||||
|
|
@ -216,20 +216,14 @@ describe("SF JSONL schema versioning", () => {
|
|||
assert.equal(legacy.runId, "legacy-run");
|
||||
});
|
||||
|
||||
test("judgment_log_reads_legacy_jsonl_rows_as_version_1", () => {
|
||||
test("judgment_log_reads_entries_written_via_appendJudgment", () => {
|
||||
const project = makeProject();
|
||||
const path = join(project, ".sf", "judgment-log.jsonl");
|
||||
|
||||
writeFileSync(
|
||||
path,
|
||||
`${JSON.stringify({
|
||||
ts: "2026-05-07T00:00:00.000Z",
|
||||
unitId: "M001/S01/T02",
|
||||
confidence: "low",
|
||||
decision: "legacy row",
|
||||
})}\n`,
|
||||
"utf-8",
|
||||
);
|
||||
appendJudgment(project, {
|
||||
unitId: "M001/S01/T02",
|
||||
confidence: "low",
|
||||
decision: "legacy row",
|
||||
});
|
||||
|
||||
const [entry] = readJudgmentLog(project, "M001");
|
||||
assert.equal(entry.schemaVersion, 1);
|
||||
|
|
|
|||
|
|
@ -153,40 +153,32 @@ describe("metrics-central", () => {
|
|||
expect(dashboard.resources.activeSessions).toBe(1);
|
||||
expect(dashboard.resources.activeAgents).toBe(2);
|
||||
expect(dashboard.resources.concurrentToolCalls).toBe(3);
|
||||
expect(getMetricsSystemStats().databaseStatus).toBe("disconnected");
|
||||
expect(getMetricsSystemStats().databaseStatus).toBe("connected");
|
||||
});
|
||||
|
||||
it("stopMetricsCentral_persists_metrics_to_db_adapter", () => {
|
||||
const rows = [];
|
||||
const db = {
|
||||
exec() {},
|
||||
prepare(sql) {
|
||||
if (sql.startsWith("INSERT")) {
|
||||
return {
|
||||
run(name, type, labels, value, timestamp, sessionId) {
|
||||
rows.push({ name, type, labels, value, timestamp, sessionId });
|
||||
},
|
||||
};
|
||||
}
|
||||
throw new Error(`unexpected SQL: ${sql}`);
|
||||
},
|
||||
};
|
||||
initMetricsCentral("/tmp/test-project", {
|
||||
dbAdapter: db,
|
||||
sessionId: "sess-db",
|
||||
});
|
||||
recordCounter("sf_test_db_counter", { label: "a=b,c" }, 2);
|
||||
it("stopMetricsCentral_persists_metrics_to_db", async () => {
|
||||
const { DatabaseSync } = await import("node:sqlite");
|
||||
|
||||
stopMetricsCentral();
|
||||
// The beforeEach already called initMetricsCentral("/tmp/test-project"),
|
||||
// so we record in the already-open metrics.db and verify after stop.
|
||||
recordCounter("sf_test_db_counter", { label: "a=b,c" }, 2);
|
||||
stopMetricsCentral(); // flush + close; afterEach stopMetricsCentral is a no-op
|
||||
|
||||
const dbPath = "/tmp/test-project/.sf/metrics.db";
|
||||
const db = new DatabaseSync(dbPath, { open: true });
|
||||
const rows = db
|
||||
.prepare(
|
||||
"SELECT name, type, value FROM metrics WHERE name = 'sf_test_db_counter' ORDER BY id DESC LIMIT 10",
|
||||
)
|
||||
.all();
|
||||
db.close();
|
||||
|
||||
expect(rows).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
name: "sf_test_db_counter",
|
||||
type: "counter",
|
||||
labels: JSON.stringify({ label: "a=b,c", session_id: "sess-db" }),
|
||||
value: 2,
|
||||
sessionId: "sess-db",
|
||||
}),
|
||||
]),
|
||||
);
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
|||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, describe, test } from "vitest";
|
||||
// Import preferences.js so that _initPrefsLoader is called and the circular dep lazy-loader is wired up.
|
||||
import "../preferences.js";
|
||||
import { resolveModelWithFallbacksForUnit } from "../preferences-models.js";
|
||||
import { getConfiguredEnvApiKey } from "../provider-env-auth.js";
|
||||
|
||||
|
|
|
|||
|
|
@ -223,7 +223,7 @@ test("openDatabase_migrates_v27_tasks_without_created_at_through_spec_backfill",
|
|||
const version = db
|
||||
.prepare("SELECT MAX(version) AS version FROM schema_version")
|
||||
.get();
|
||||
assert.equal(version.version, 54);
|
||||
assert.equal(version.version, 57);
|
||||
const taskSpec = db
|
||||
.prepare(
|
||||
"SELECT milestone_id, slice_id, task_id, verify FROM task_specs WHERE task_id = 'T01'",
|
||||
|
|
|
|||
|
|
@ -43,18 +43,48 @@ export function createTurnObserver(options) {
|
|||
*/
|
||||
function nextSequenceMetadata(category, operation, metadata) {
  // Returns `metadata` (or {} when nullish) extended with `writeSequence` and
  // `writerTokenId` taken from the next write record. When no writer token is
  // held, or when sequencing fails, the plain metadata is returned instead so
  // callers never crash on a sequencing problem.
  const base = metadata ?? {};
  if (!writerToken) return base;

  // Requests the next write record for the current `writerToken` and merges
  // its sequence fields (plus any `extra` flags) onto the caller's metadata.
  const stamp = (extra) => {
    const record = nextWriteRecord({
      basePath: options.basePath,
      token: writerToken,
      category,
      operation,
      metadata,
    });
    return {
      ...base,
      writeSequence: record.sequence.sequence,
      writerTokenId: record.writerToken.tokenId,
      ...extra,
    };
  };

  try {
    return stamp();
  } catch (err) {
    // Any failure other than an inactive token: continue without sequence metadata.
    if (!err?.message?.includes("Writer token is not active")) return base;

    // Token expired (TTL) or lost after process resume — re-acquire and retry once.
    try {
      writerToken = acquireWriterToken({
        basePath: options.basePath,
        traceId: current?.traceId,
        turnId: current?.turnId,
      });
      return stamp({ tokenRenewed: true });
    } catch {
      // Re-acquisition failed — continue without sequence metadata rather than crashing.
      return base;
    }
  }
}
|
||||
return {
|
||||
onTurnStart(contract) {
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ import { atomicWriteSync } from "../atomic-write.js";
|
|||
import { sfRoot } from "../paths.js";
|
||||
|
||||
const activeTokens = new Map();
|
||||
const TOKEN_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
||||
const TOKEN_TTL_MS = 2 * 60 * 60 * 1000; // 2 hours — autonomous turns can run 20-30+ minutes
|
||||
/**
 * Build the composite key string for a (basePath, turnId) pair.
 * NOTE(review): presumably the key used for the `activeTokens` map — confirm against callers.
 *
 * @param {string} basePath - Project base path.
 * @param {string} turnId - Turn identifier.
 * @returns {string} The two parts joined as "basePath:turnId".
 */
function tokenKey(basePath, turnId) {
  const separator = ":";
  return `${basePath}${separator}${turnId}`;
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue