sf snapshot: uncommitted changes after 131m inactivity

This commit is contained in:
Mikael Hugo 2026-05-09 02:53:47 +02:00
parent 5188b93ddc
commit 9875812c1b
44 changed files with 1149 additions and 87 deletions

View file

@ -0,0 +1,3 @@
{
"lastFullVacuumAt": "2026-05-08T20:15:21.317Z"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,3 @@
{
"lastFullVacuumAt": "2026-05-08T20:29:49.200Z"
}

Binary file not shown.

15
package-lock.json generated
View file

@ -5708,6 +5708,10 @@
"node_modules/@singularity-forge/engine-win32-x64-msvc": {
"optional": true
},
"node_modules/@singularity-forge/google-gemini-cli-provider": {
"resolved": "packages/google-gemini-cli-provider",
"link": true
},
"node_modules/@singularity-forge/native": {
"resolved": "packages/native",
"link": true
@ -14618,6 +14622,16 @@
"url": "https://github.com/sponsors/colinhacks"
}
},
"packages/google-gemini-cli-provider": {
"name": "@singularity-forge/google-gemini-cli-provider",
"version": "2.75.3",
"dependencies": {
"@google/gemini-cli-core": "0.40.1"
},
"engines": {
"node": ">=26.1.0"
}
},
"packages/native": {
"name": "@singularity-forge/native",
"version": "2.75.3",
@ -14651,6 +14665,7 @@
"@google/genai": "^1.40.0",
"@mistralai/mistralai": "^2.2.1",
"@sinclair/typebox": "^0.34.41",
"@singularity-forge/google-gemini-cli-provider": "^2.75.3",
"ajv": "^8.17.1",
"ajv-formats": "^3.0.1",
"chalk": "^5.6.2",

View file

@ -48,7 +48,8 @@
"build:pi-coding-agent": "npm --workspace @singularity-forge/pi-coding-agent run build",
"build:native-pkg": "npm --workspace @singularity-forge/native run build",
"build:rpc-client": "npm --workspace @singularity-forge/rpc-client run build",
"build:pi": "npm run build:native-pkg && npm run build:pi-tui && npm run build:pi-ai && npm run build:pi-agent-core && npm run build:pi-coding-agent",
"build:google-gemini-cli-provider": "npm --workspace @singularity-forge/google-gemini-cli-provider run build",
"build:pi": "npm run build:native-pkg && npm run build:pi-tui && npm run build:google-gemini-cli-provider && npm run build:pi-ai && npm run build:pi-agent-core && npm run build:pi-coding-agent",
"build:daemon": "npm --workspace @singularity-forge/daemon run build",
"build:core": "npm run build:pi && npm run build:rpc-client && npm run build:daemon && npm run check:versioned-json && tsc && npm run copy-resources && npm run copy-themes && npm run copy-export-html",
"build": "npm run build:core && node scripts/build-web-if-stale.cjs",

View file

@ -0,0 +1,23 @@
{
"name": "@singularity-forge/google-gemini-cli-provider",
"version": "2.75.3",
"description": "Gemini CLI Core transport helper for SF providers",
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"scripts": {
"build": "tsc -p tsconfig.json"
},
"dependencies": {
"@google/gemini-cli-core": "0.40.1"
},
"engines": {
"node": ">=26.1.0"
}
}

View file

@ -0,0 +1,40 @@
import assert from "node:assert/strict";
import { describe, test, vi } from "vitest";
// Shared mutable state the mocked modules below write into, so the test can
// inspect what the helper forwarded. vi.hoisted() runs before the vi.mock
// factories (which Vitest hoists above all imports), so this object exists
// by the time either mock factory executes.
const helperState = vi.hoisted(() => ({
  // AuthType value the helper passed to createContentGeneratorConfig.
  authType: undefined as unknown,
  // Params object the helper passed to makeFakeConfig.
  configParams: undefined as Record<string, unknown> | undefined,
}));
// Stub the package entry point: expose only the AuthType constant the helper
// reads, and a makeFakeConfig that records its params instead of building a
// real Config. The returned shape just needs to be an object the helper can
// pass along.
vi.mock("@google/gemini-cli-core", () => ({
  AuthType: { LOGIN_WITH_GOOGLE: "LOGIN_WITH_GOOGLE" },
  makeFakeConfig: vi.fn((params: Record<string, unknown>) => {
    // Capture for the assertions in the test body below.
    helperState.configParams = params;
    return { params };
  }),
}));
// Stub the deep contentGenerator module: record which authType the helper
// selects, and return a generator whose stream is empty so nothing network-
// or OAuth-related runs during the test.
vi.mock("@google/gemini-cli-core/dist/src/core/contentGenerator.js", () => ({
  createContentGeneratorConfig: vi.fn(async (_config, authType) => {
    // Capture for the auth-type assertion below.
    helperState.authType = authType;
    return { authType };
  }),
  createContentGenerator: vi.fn(async () => ({
    // Minimal ContentGenerator surface: an async generator with no items.
    async generateContentStream(): Promise<AsyncGenerator<unknown>> {
      return (async function* emptyStream() {})();
    },
  })),
}));
import { createGeminiCliContentGenerator } from "./index.js";
describe("google-gemini-cli-provider", () => {
  // Verifies the helper wires Gemini CLI Core with Google-login auth and
  // forwards the model id plus cwd-derived directories into the fake config.
  test("createGeminiCliContentGenerator_uses_google_login_auth", async () => {
    await createGeminiCliContentGenerator({ modelId: "gemini-3-pro" });
    // Auth must be the OAuth "login with Google" path, not an API-key path.
    assert.equal(helperState.authType, "LOGIN_WITH_GOOGLE");
    assert.equal(helperState.configParams?.model, "gemini-3-pro");
    // With no cwd/targetDir options, both default to process.cwd().
    assert.equal(helperState.configParams?.cwd, process.cwd());
    assert.equal(helperState.configParams?.targetDir, process.cwd());
  });
});

View file

@ -0,0 +1,48 @@
/**
* Google Gemini CLI transport helper.
*
* Purpose: keep the Gemini CLI Core auth and content-generator wiring in a
* dedicated workspace package so provider code can depend on one small helper
* instead of embedding the upstream integration inline.
*
* Consumer: `@singularity-forge/pi-ai` Google Gemini provider.
*/
import {
AuthType,
makeFakeConfig,
} from "@google/gemini-cli-core";
import {
createContentGenerator,
createContentGeneratorConfig,
type ContentGenerator,
} from "@google/gemini-cli-core/dist/src/core/contentGenerator.js";
/**
 * Options for {@link createGeminiCliContentGenerator}.
 */
export interface GeminiCliContentGeneratorOptions {
  /** Gemini model id, forwarded as the `model` field of the fake config. */
  modelId: string;
  /** Working directory for the fake config; defaults to `process.cwd()`. */
  cwd?: string;
  /** Target directory for the fake config; defaults to the resolved `cwd`. */
  targetDir?: string;
}
/**
* Create a Gemini CLI Core content generator for a model.
*
* Purpose: centralize the Code Assist setup and OAuth bootstrap logic in a
* reusable package so SF's Gemini provider can stay focused on stream shaping.
*
* Consumer: the Google Gemini provider in pi-ai.
*/
export async function createGeminiCliContentGenerator(
options: GeminiCliContentGeneratorOptions,
): Promise<ContentGenerator> {
const cwd = options.cwd ?? process.cwd();
const config = makeFakeConfig({
model: options.modelId,
cwd,
targetDir: options.targetDir ?? cwd,
});
const generatorConfig = await createContentGeneratorConfig(
config,
AuthType.LOGIN_WITH_GOOGLE,
);
return createContentGenerator(generatorConfig, config);
}

View file

@ -0,0 +1,28 @@
{
"compilerOptions": {
"target": "ES2024",
"module": "Node16",
"lib": ["ES2024"],
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"incremental": true,
"forceConsistentCasingInFileNames": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"inlineSources": true,
"inlineSourceMap": false,
"moduleResolution": "Node16",
"resolveJsonModule": true,
"allowImportingTsExtensions": false,
"experimentalDecorators": true,
"emitDecoratorMetadata": true,
"useDefineForClassFields": false,
"types": ["node"],
"outDir": "./dist",
"rootDir": "./src"
},
"include": ["src/**/*.ts"],
"exclude": ["node_modules", "dist", "**/*.d.ts", "src/**/*.d.ts"]
}

View file

@ -29,6 +29,7 @@
"@google/gemini-cli-core": "0.40.1",
"@google/genai": "^1.40.0",
"@mistralai/mistralai": "^2.2.1",
"@singularity-forge/google-gemini-cli-provider": "^2.75.3",
"@sinclair/typebox": "^0.34.41",
"ajv": "^8.17.1",
"ajv-formats": "^3.0.1",

View file

@ -5,41 +5,34 @@ import type { Context, Model } from "../types.js";
const geminiCliCore = vi.hoisted(() => ({
retryError: undefined as Error | undefined,
retryOptions: undefined as Record<string, unknown> | undefined,
fakeConfigParams: undefined as Record<string, unknown> | undefined,
generatorAuthType: undefined as unknown,
helperArgs: undefined as Record<string, unknown> | undefined,
}));
vi.mock("@google/gemini-cli-core", () => ({
AuthType: { LOGIN_WITH_GOOGLE: "LOGIN_WITH_GOOGLE" },
CodeAssistServer: class {
async generateContentStream(): Promise<AsyncGenerator<unknown>> {
return (async function* emptyStream() {})();
}
},
getOauthClient: vi.fn(async () => ({})),
makeFakeConfig: vi.fn((params: Record<string, unknown>) => {
geminiCliCore.fakeConfigParams = params;
return { params };
}),
retryWithBackoff: vi.fn(
async (_fn: unknown, options: Record<string, unknown>) => {
geminiCliCore.retryOptions = options;
throw geminiCliCore.retryError ?? new Error("quota exhausted");
},
),
setupUser: vi.fn(async () => ({ projectId: "test-project" })),
}));
vi.mock("@google/gemini-cli-core/dist/src/core/contentGenerator.js", () => ({
createContentGeneratorConfig: vi.fn(async (_config, authType) => {
geminiCliCore.generatorAuthType = authType;
return { authType };
}),
createContentGenerator: vi.fn(async () => ({
async generateContentStream(): Promise<AsyncGenerator<unknown>> {
return (async function* emptyStream() {})();
vi.mock("@singularity-forge/google-gemini-cli-provider", () => ({
createGeminiCliContentGenerator: vi.fn(
async (args: Record<string, unknown>) => {
geminiCliCore.helperArgs = args;
return {
async generateContentStream(): Promise<AsyncGenerator<unknown>> {
return (async function* emptyStream() {})();
},
};
},
})),
),
}));
import { streamGoogleGeminiCli } from "./google-gemini-cli.js";
@ -82,12 +75,7 @@ describe("google-gemini-cli provider retry ownership", () => {
| { maxAttempts?: unknown }
| undefined;
assert.equal(retryOptions?.maxAttempts, 1);
assert.equal(
geminiCliCore.fakeConfigParams?.model,
"gemini-3-flash-preview",
);
assert.equal(geminiCliCore.fakeConfigParams?.clientName, undefined);
assert.equal(geminiCliCore.generatorAuthType, "LOGIN_WITH_GOOGLE");
assert.equal(geminiCliCore.helperArgs?.modelId, "gemini-3-flash-preview");
assert.equal(result.stopReason, "error");
assert.match(result.errorMessage ?? "", /exhausted your capacity/i);
assert.equal(result.retryAfterMs, 54_000);

View file

@ -1,24 +1,12 @@
/**
* Google Gemini CLI provider.
*
* Delegates auth, project discovery, and the Code Assist transport to
* @google/gemini-cli-core the library behind Google's Gemini tooling.
* cli-core reads ~/.gemini/oauth_creds.json itself when present, refreshes tokens,
* discovers the project (free-tier or whatever's onboarded server-side)
* via setupUser(), and handles all the User-Agent / quota-classification details.
* Delegates auth, project discovery, and the Code Assist transport setup to
* the dedicated google-gemini-cli-provider package.
* Request retry/fallback stays in the caller so SF can move to the next model.
*/
import {
AuthType,
makeFakeConfig,
retryWithBackoff,
} from "@google/gemini-cli-core";
import type { ContentGenerator } from "@google/gemini-cli-core/dist/src/core/contentGenerator.js";
import {
createContentGenerator,
createContentGeneratorConfig,
} from "@google/gemini-cli-core/dist/src/core/contentGenerator.js";
import { retryWithBackoff } from "@google/gemini-cli-core";
import type {
Content,
GenerateContentParameters,
@ -55,6 +43,7 @@ import {
isAutoReasoning,
resolveReasoningLevel,
} from "./simple-options.js";
import { createGeminiCliContentGenerator } from "@singularity-forge/google-gemini-cli-provider";
/**
* Thinking level for Gemini 3 models.
@ -73,7 +62,8 @@ export type GoogleThinkingLevel =
/**
* Options for `streamGoogleGeminiCli()`.
*
* Delegates auth to cli-core (reads ~/.gemini/oauth_creds.json via `getOauthClient()`);
* Delegates auth to the helper package (reads ~/.gemini/oauth_creds.json via
* Gemini CLI Core's transport setup);
* `projectId` is auto-discovered and not used by this provider (apiKey is ignored).
* Thinking is configured separately from base `StreamOptions` because Gemini 2 and 3
* models use incompatible enum formats (budgetTokens vs. level).
@ -100,30 +90,6 @@ export interface GoogleGeminiCliOptions extends StreamOptions {
// Counter for generating unique tool call IDs
let toolCallCounter = 0;
/**
* Build a Code Assist content generator using cli-core's official content-generator path.
*
* Upstream Gemini CLI does not instantiate CodeAssistServer directly from the
* caller. It creates a ContentGeneratorConfig, lets createContentGenerator()
* build the GeminiCLI User-Agent and transport headers, then delegates to
* createCodeAssistContentGenerator() for OAuth, setupUser(), and Code Assist.
*
* Both calls memoize internally inside cli-core repeat invocations are
* cheap.
*/
async function getCodeAssistServer(modelId: string): Promise<ContentGenerator> {
const config = makeFakeConfig({
model: modelId,
cwd: process.cwd(),
targetDir: process.cwd(),
});
const generatorConfig = await createContentGeneratorConfig(
config,
AuthType.LOGIN_WITH_GOOGLE,
);
return createContentGenerator(generatorConfig, config);
}
function parseDurationMs(value: string): number | undefined {
const match = value.match(/(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?/i);
if (!match || !match[0]) return undefined;
@ -178,14 +144,14 @@ function isGemini3Model(modelId: string): boolean {
}
/**
* Stream a chat completion from Google Gemini via the cli-core transport.
* Stream a chat completion from Google Gemini via the helper package and cli-core transport.
*
* Auth is handled transparently by cli-core (`getCodeAssistServer()` reads OAuth creds from
* ~/.gemini/oauth_creds.json and triggers browser OAuth on first run). Project ID is auto-discovered
* from the Code Assist API; `apiKey` is ignored. Casting the request as `any` works around the fact
* that cli-core bundles its own nested `@google/genai` copy (nominal type split at packaging time;
* runtime shapes are byte-identical). Returns a real-time stream emitting start, delta, end, and
* error events that accumulate into an `AssistantMessage`.
* The helper package owns the OAuth/bootstrap path against `@google/gemini-cli-core`, including
* `~/.gemini/oauth_creds.json` and Gemini Code Assist project discovery. `apiKey` is ignored.
* Casting the request as `any` works around the fact that cli-core bundles its own nested
* `@google/genai` copy (nominal type split at packaging time; runtime shapes are byte-identical).
* Returns a real-time stream emitting start, delta, end, and error events that accumulate into
* an `AssistantMessage`.
*/
export const streamGoogleGeminiCli: StreamFunction<
"google-gemini-cli",
@ -222,9 +188,10 @@ export const streamGoogleGeminiCli: StreamFunction<
if (nextReq !== undefined) {
req = nextReq as GenerateContentParameters;
}
// cli-core handles auth + project discovery. SF uses cli-core directly
// and does not spawn a separate provider CLI process.
const server = await getCodeAssistServer(req.model);
// cli-core handles auth + project discovery through the helper package.
const server = await createGeminiCliContentGenerator({
modelId: req.model,
});
const promptId = `pi-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
// Cast through `any` — cli-core bundles its own nested @google/genai copy,
// so TypeScript sees two structurally-identical-but-distinct Content types.
@ -233,7 +200,6 @@ export const streamGoogleGeminiCli: StreamFunction<
const streamGen = await retryWithBackoff(
() => server.generateContentStream(req as any, promptId, "USER" as any),
{
authType: AuthType.LOGIN_WITH_GOOGLE,
// SF owns cross-model fallback. Let cli-core classify quota errors,
// but do not let it hold the turn through its 10-attempt retry loop.
maxAttempts: 1,

View file

@ -5,15 +5,15 @@ import { parseArgs } from "./args.js";
describe("parseArgs", () => {
it("parses optional-value extension flags with implicit and explicit values", () => {
const extensionFlags = new Map([
["genai-proxy", { type: "string" as const, allowNoValue: true }],
["demo-flag", { type: "string" as const, allowNoValue: true }],
]);
const defaultFlagArgs = parseArgs(["--genai-proxy"], extensionFlags);
const explicitFlagArgs = parseArgs(["--genai-proxy=8080"], extensionFlags);
const defaultFlagArgs = parseArgs(["--demo-flag"], extensionFlags);
const explicitFlagArgs = parseArgs(["--demo-flag=8080"], extensionFlags);
assert.deepEqual(
[
defaultFlagArgs.unknownFlags.get("genai-proxy"),
explicitFlagArgs.unknownFlags.get("genai-proxy"),
defaultFlagArgs.unknownFlags.get("demo-flag"),
explicitFlagArgs.unknownFlags.get("demo-flag"),
],
[true, "8080"],
);

View file

@ -190,7 +190,7 @@ export class Editor implements Component, Focusable {
private autocompleteDebounceTimer: ReturnType<typeof setTimeout> | null =
null;
private lastAutocompleteLookupPrefix: string | null = null;
private static readonly AUTOCOMPLETE_DEBOUNCE_MS = 150;
private static readonly AUTOCOMPLETE_DEBOUNCE_MS = 50;
// Paste tracking for large pastes
private pastes: Map<number, string> = new Map();

View file

@ -94,6 +94,7 @@ if (require.main === module) {
const WORKSPACE_PACKAGES = [
"native",
"pi-tui",
"google-gemini-cli-provider",
"pi-ai",
"pi-agent-core",
"pi-coding-agent",

View file

@ -28,6 +28,7 @@ mkdirSync(piAgentDir, { recursive: true });
const copied = [];
if (copyDir("extensions")) copied.push("extensions");
if (copyDir("skills")) copied.push("skills");
if (copyDir("workflow-skills")) copied.push("workflow-skills");
if (copyDir("agents")) copied.push("agents");
const agentsMdSrc = join(resourcesDir, "AGENTS.md");

View file

@ -36,6 +36,7 @@ const scopeDir = join(root, "node_modules", scope);
const packageDirs = [
"native",
"pi-agent-core",
"google-gemini-cli-provider",
"pi-ai",
"pi-coding-agent",
"pi-tui",

View file

@ -60,6 +60,7 @@ function removeIfContentMatches(targetPath, sourcePath, label) {
removeResourceEntries("extensions");
removeResourceEntries("skills");
removeResourceEntries("workflow-skills");
removeResourceEntries("agents");
removeIfContentMatches(
join(piAgentDir, "AGENTS.md"),

View file

@ -69,6 +69,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
{ cmd: "queue", desc: "Queue and reorder future milestones" },
{ cmd: "quick", desc: "Execute a quick task without full planning overhead" },
{ cmd: "discuss", desc: "Discuss architecture and decisions" },
{ cmd: "steer", desc: "Steerable autonomous panel (Shift+Tab)" },
{ cmd: "capture", desc: "Fire-and-forget thought capture" },
{ cmd: "debug", desc: "Create and inspect persistent /debug sessions" },
{ cmd: "scan", desc: "Run source and project scans" },

View file

@ -19,6 +19,13 @@ export default async function registerExtension(pi) {
// tools, hooks) fails — e.g. due to a Windows-specific import error.
const { registerSFCommands } = await import("./commands/index.js");
registerSFCommands(pi);
// Register steerable autonomous extension for Copilot Auto-style controls
const { default: steerableAutonomousExtension } = await import(
"./steerable-autonomous-extension.js"
);
steerableAutonomousExtension(pi);
// Full setup (shortcuts, tools, hooks) in a separate try/catch so that
// any platform-specific load failure doesn't take out the core command.
try {

View file

@ -11,7 +11,9 @@ import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
const SKILL_FILENAME = "SKILL.md";
export { SKILL_FILENAME };
const USER_SKILL_DIR = join(process.env.HOME ?? "", ".sf", "skills");
export { USER_SKILL_DIR };
const BUNDLED_SKILL_DIR = join(
dirname(fileURLToPath(import.meta.url)),
"..",
@ -19,6 +21,15 @@ const BUNDLED_SKILL_DIR = join(
"..",
"skills",
);
export { BUNDLED_SKILL_DIR };
const WORKFLOW_SKILL_DIR = join(
dirname(fileURLToPath(import.meta.url)),
"..",
"..",
"..",
"workflow-skills",
);
export { WORKFLOW_SKILL_DIR };
/**
* Find all skill directories under a base path.
@ -41,12 +52,12 @@ export function discoverSkillDirs(basePath) {
}
/**
* Discover skills from all sources: project, user, and built-in.
* Discover skills from all sources: project, user, built-in, and workflow-internal.
*/
export function discoverAllSkills(projectPath, options = {}) {
const sources = [];
// Bundled SF skills
// Bundled SF skills (user-facing, shown in /skills catalog)
if (options.includeBundled && existsSync(BUNDLED_SKILL_DIR)) {
const bundledSkills = discoverSkillDirsInRoot(BUNDLED_SKILL_DIR);
for (const s of bundledSkills) {
@ -54,6 +65,14 @@ export function discoverAllSkills(projectPath, options = {}) {
}
}
// Workflow-internal skills (hidden from users, injected by the runtime)
if (options.includeWorkflow !== false && existsSync(WORKFLOW_SKILL_DIR)) {
const workflowSkills = discoverSkillDirsInRoot(WORKFLOW_SKILL_DIR);
for (const s of workflowSkills) {
sources.push({ ...s, source: "workflow" });
}
}
// Project skills
if (projectPath) {
const projectSkills = discoverSkillDirs(projectPath);

View file

@ -18,6 +18,7 @@ export {
readSkillFile,
SKILL_FILENAME,
USER_SKILL_DIR,
WORKFLOW_SKILL_DIR,
} from "./directory.js";
export {
createEvalCase,

View file

@ -48,7 +48,7 @@ export function loadSkills(projectPath, options = {}) {
}
const validation =
source === "bundled"
source === "bundled" || source === "workflow"
? validateBundledSkillFrontmatter(parsed.frontmatter)
: validateSkillFrontmatter(parsed.frontmatter);
if (!validation.valid) {
@ -64,7 +64,10 @@ export function loadSkills(projectPath, options = {}) {
}
const record = buildSkillRecord(path, parsed.frontmatter, parsed.body);
if (
if (source === "workflow") {
// Workflow-internal skills are never user-invocable regardless of frontmatter
record.userInvocable = false;
} else if (
source === "bundled" &&
parsed.frontmatter["user-invocable"] === undefined
) {
@ -132,7 +135,8 @@ export function getPermittedSkills(skills, activeProfile) {
*/
export function getUserInvocableSkills(skills) {
return skills.filter(
(s) => s.source === "bundled" && s.valid && s.userInvocable,
(s) =>
s.source !== "workflow" && s.source === "bundled" && s.valid && s.userInvocable,
);
}

View file

@ -0,0 +1,92 @@
---
name: assumption-log
description: Document assumptions, proceed with sensible defaults, surface for review at milestones. Use in research and planning workflows where context is incomplete. Blocks the "ask the user every 5 minutes" pattern and the "guess silently and break something" pattern. Every assumption becomes a named, reviewable artifact.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- plan
- research
- "*"
---
# Assumption Log
## Iron Law
```
NEVER GUESS SILENTLY.
NEVER ASK FOR EVERY MISSING DETAIL.
DOCUMENT THE ASSUMPTION, PICK A SENSIBLE DEFAULT, SURFACE FOR REVIEW.
```
Silent guessing produces invisible errors. Asking for every missing detail breaks autonomous flow. The correct middle path: make the assumption explicit, pick a defensible default, continue, and surface the log at review gates.
## Recognize Your Own Rationalizations
- "I'll just ask the user." → Ask only when the decision is irreversible or the cost of a wrong assumption is high. For everything else: document and proceed.
- "I know what they meant." → If you know, document the inference explicitly. If you don't know, document the assumption and the default you chose.
- "It's obvious — I don't need to write it down." → What is obvious to you during planning is invisible to the reviewer and to your future self. Write it down.
- "I'll address it when it comes up." → When it comes up, you won't remember what assumption you made. The log is the memory.
## When to Run
- At the start of any research or planning phase with incomplete context
- When a planning decision depends on information that isn't in the codebase or spec
- When a scope decision must be made without explicit instruction
- Before each irreversible op (combine with `irreversible-ops` skill)
## Assumption Entry Format
For each assumption, record:
```
Assumption ID: A-<NNN>
Category: <scope | design | dependency | behaviour | constraint>
Statement: <what you are assuming to be true>
Basis: <why this default was chosen — evidence, convention, or reasoning>
Default chosen: <the specific value, behaviour, or approach you will proceed with>
Confidence: <high | medium | low>
Falsifier: <what evidence would prove this assumption wrong>
Review gate: <at what milestone or checkpoint this should be surfaced>
Impact if wrong: <what breaks if the assumption is incorrect>
```
**Confidence guidelines:**
- `high` — strong evidence from code, docs, or established convention; probably correct
- `medium` — inferred from partial evidence; plausible but should be confirmed
- `low` — no evidence; pure default; must be confirmed before the affected code ships
## Assumption Categories
**Scope** — what is in/out of this task
> "Assumption: the email notification feature is out of scope for this slice. Basis: spec says 'user profile update' with no mention of notifications. Default: skip. Review at slice completion."
**Design** — how something should be structured
> "Assumption: use SQLite for local state storage rather than JSON files. Basis: project uses SQLite everywhere else. Default: SQLite. Confidence: high."
**Dependency** — which version, API, or external behaviour to rely on
> "Assumption: the gateway API responds within 5 seconds. Basis: no SLA documented; 5s is standard for synchronous APIs. Default: 5s timeout. Confidence: medium."
**Behaviour** — what the system should do in an edge case
> "Assumption: on parse error, return empty array not null. Basis: existing code uses empty arrays for not-found cases. Default: []. Confidence: high."
**Constraint** — limits on resources, permissions, or side effects
> "Assumption: this migration is safe to run without a maintenance window. Basis: adds a nullable column, no lock required. Default: proceed without window. Confidence: medium. Falsifier: if table > 10M rows, lock time may matter."
## Review Gate Protocol
At each milestone or slice completion, surface all `medium` and `low` confidence assumptions:
1. List all logged assumptions for the current slice
2. Mark each: `CONFIRMED` (user or evidence validated it), `REVISED` (different default chosen), or `OPEN` (still unconfirmed)
3. Any `low` confidence assumption that remains `OPEN` blocks slice completion
4. Any `medium` confidence assumption that remains `OPEN` is a known risk — document it in the slice evidence
## Completion Criteria
- [ ] All assumptions made during the workflow are logged with full entry format
- [ ] All `low` confidence assumptions are confirmed or revised before the slice ships
- [ ] All `medium` confidence assumptions are surfaced at the milestone gate
- [ ] The assumption log is attached to the slice/task artifacts in `.sf/active/{unit-id}/assumptions.md`

View file

@ -0,0 +1,116 @@
---
name: context-lean
description: Prune context before each LLM call. Use in any multi-step workflow that accumulates context across iterations. Less but more relevant context produces better outputs. Prevents context bloat — the single biggest silent quality degrader in long autonomous runs.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- "*"
---
# Context Lean
## Iron Law
```
CONTEXT IS A BUDGET, NOT A DUMP.
EVERY TOKEN IN CONTEXT MUST EARN ITS PLACE.
```
Adding more context is not safer than adding less. Irrelevant context degrades output quality by diluting signal. When in doubt, leave it out.
## Recognize Your Own Rationalizations
- "More context can't hurt — it gives the model more to work with." → Wrong. Noise degrades recall. The model attends to everything; irrelevant context steals attention from relevant context.
- "I'll include the whole file to be safe." → Include only the functions you're actually modifying. The rest is noise.
- "I need to include the history so the model understands the situation." → Include the summary, not the transcript. Summaries are signal; raw transcripts are noise.
- "The token limit isn't hit yet, so it's fine." → Token limits are not quality thresholds. Quality degrades well before the limit.
## When to Run
Before any LLM call in a multi-step workflow. Especially:
- Before each autonomous iteration
- Before a planning call that synthesizes many inputs
- After completing a phase (prune phase artifacts before the next phase)
- When the context window is more than 50% full
## Skill Chain
Inline skill. Run as a pre-call gate before each significant LLM invocation.
```
← prev: any skill, before its LLM call
→ next: return to the invoking skill with pruned context
```
## Pruning Protocol
Apply in order. Stop when the context is lean.
### Step 1 — Remove completed work
Anything that was needed to get to the current state but is not needed to proceed:
- Completed task details (keep the summary, drop the steps)
- Resolved errors (keep the fix, drop the stack trace)
- Superseded plans (keep the current plan, drop the draft)
### Step 2 — Summarize transcripts
Raw conversation history is always worse than a summary. For any context block older than the current phase:
1. Write a 3-5 sentence summary: what was decided, what was built, what failed
2. Replace the transcript block with the summary
3. Keep only the last 2-3 turns verbatim (for continuity)
### Step 3 — Scope file content
Never include entire files when you only need parts of them:
- Include only the functions/methods being modified
- Include only the test cases for the current behaviour
- Include only the error output relevant to the current failure
If a file must be included whole (e.g., a small config), it must be ≤ 50 lines or explicitly justified.
### Step 4 — Audit includes
For every block of context, ask: **if this were removed, would the model's output be worse?** If the answer is "maybe not," remove it.
Keep:
- The current task/goal (always)
- The specific code being modified (always)
- The error message or test failure driving the current step (always)
- The contract/spec for the current slice (always)
- Recent decisions that constrain the current step
Remove:
- Earlier phases' full output (summarize)
- Files not touched in the current step
- Passing test output (keep only failures)
- Dependency documentation (link, don't include)
- Comment threads and discussion (summarize conclusions)
### Step 5 — Verify budget
After pruning:
- Context should fit in < 30% of the token budget for simple tasks, < 60% for complex ones
- If still over budget after pruning, the task is too large for one call — split it
## Context Composition Rules
| Source | Include | Format |
|--------|---------|--------|
| Current task | Always | Full |
| Current file being edited | Only changed functions | Snippet |
| Current error / test failure | Always | Full |
| Previous phase output | Summary only | 3-5 sentences |
| Related file (not being edited) | Only the contract/signature | Snippet |
| Conversation history | Last 2-3 turns + summary of rest | Mixed |
| Documentation | Never inline | Reference by path |
## Completion Criteria
Context is lean when:
- [ ] No completed phase artifacts in full (only summaries)
- [ ] No entire files included when snippets suffice
- [ ] Every included block answers "yes" to the audit question
- [ ] Token budget is within target

View file

@ -0,0 +1,130 @@
---
name: error-routing
description: Route errors by type, not severity. Use in any workflow with retry or error-handling steps. Maps error classes (transient, semantic, auth, infra, logic, contract) to their correct handlers. Prevents the two most common agent failure modes — retrying logic errors, and ignoring transient failures.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- repair
- "*"
---
# Error Routing
## Iron Law
```
ROUTE BY CLASS FIRST, SEVERITY SECOND.
NEVER RETRY A LOGIC ERROR.
NEVER ABANDON A TRANSIENT ERROR WITHOUT RETRY.
```
Retrying a logic error wastes time and can cause data corruption. Abandoning a transient error causes false failures. Routing by severity ("it's a 500, must be important") misclassifies both.
## Recognize Your Own Rationalizations
- "It failed, so I'll try a different approach." → Different approach to what? Classify the error first. A different approach to a transient failure is wrong — you need the same approach with a wait.
- "It's a 500 error — must be a server problem." → HTTP 500s include logic errors, auth errors, and transient failures. Read the body.
- "Let me retry with exponential backoff." → Exponential backoff is for transient errors only. Applying it to logic errors just slows down the failure.
- "The test is flaky — I'll just retry it." → Flaky tests are infrastructure errors or race conditions. Classify and fix, don't retry blindly.
## Error Class Taxonomy
### Transient
**Definition:** Will resolve without code change, given time or retry.
**Examples:** network timeout, rate limit (429), service temporarily unavailable (503), lock contention, resource temporarily exhausted.
**Handler:** Retry with wait. Use Retry-After header if present; otherwise exponential backoff (1s, 2s, 4s, max 30s). Max 3 retries. If still failing after 3 retries, escalate to infra error.
**Do NOT:** change code, change approach, or report as a bug.
---
### Auth / Credential
**Definition:** Request rejected due to missing or invalid credentials.
**Examples:** 401, 403, expired token, invalid API key, insufficient permissions.
**Handler:** Do NOT retry. Surface immediately with the exact credential or permission required. Never attempt to infer or work around missing auth — escalate to the human.
**Do NOT:** retry, change approach, or attempt alternative auth methods.
---
### Logic / Contract
**Definition:** Code does the wrong thing. The error is in the logic, not the environment.
**Examples:** wrong output, failing assertion, type error, invariant violation, business rule violation, test failure (not flaky).
**Handler:** Debug, find root cause, fix. Follow `systematic-debugging` skill protocol. Do NOT retry or use a workaround.
**Do NOT:** retry, add a workaround, suppress the error.
---
### Infra / Environment
**Definition:** The execution environment is broken in a way that requires external action.
**Examples:** disk full, out of memory, missing required tool, corrupt DB, missing env var that cannot be inferred.
**Handler:** Surface immediately. Describe exactly what is missing and what the minimum fix is. Do NOT attempt to work around infra failures in code.
**Do NOT:** retry, assume it will resolve, add fallback code.
---
### Semantic / Integration
**Definition:** Two components disagree on a contract — schema mismatch, API version mismatch, unexpected data shape.
**Examples:** JSON parse error on valid-looking response, unexpected null where required, field name changed in dependency.
**Handler:** Investigate the contract. Identify which side is wrong (caller or callee). Fix the contract mismatch, not the symptom.
**Do NOT:** add nil-guards without understanding why the nil is there.
---
### Scope / Ambiguity
**Definition:** Cannot proceed because the task is not well-defined enough to make a correct decision.
**Examples:** conflicting requirements, missing spec, ambiguous acceptance criteria.
**Handler:** Surface the ambiguity with the specific decision that is blocked. Follow `assumption-log` protocol — document the assumption, pick a sensible default, mark for review.
**Do NOT:** guess silently.
## Routing Decision Tree
```
Error occurs
├─ Is it a network/rate-limit/timeout? → TRANSIENT → retry with wait
├─ Is it auth/403/401/credential? → AUTH → surface, do not retry
├─ Is it a test failure or wrong output? → LOGIC → debug + fix
├─ Is the environment broken? → INFRA → surface, external action needed
├─ Is it a contract/schema mismatch? → SEMANTIC → investigate contract
└─ Is the task underspecified? → SCOPE → assumption-log protocol
```
## Completion Criteria
For each error encountered in the workflow:
- [ ] Error classified by type (not severity)
- [ ] Handler applied per classification
- [ ] Resolution recorded (what the error was, what fixed it)
- [ ] No logic errors suppressed or worked around
- [ ] No transient errors abandoned without retry

View file

@ -0,0 +1,132 @@
---
name: handoff-readability
description: Enforce boring code, why-comments on non-obvious decisions, and clean interface contracts. Use in code-generation workflows. Makes rewrites cheap, reduces onboarding time, and prevents the "only the original author understands this" failure mode.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- review
- "*"
---
# Handoff Readability
## Iron Law
```
CODE IS READ 10X MORE THAN IT IS WRITTEN.
WRITE FOR THE READER WHO HAS ZERO CONTEXT.
BORING CODE IS A FEATURE.
```
Clever code that only the author can read is a liability. Every non-obvious decision is a future debugging session waiting to happen. Every missing comment on a "why" is a future misunderstanding that will produce a silent regression.
## Recognize Your Own Rationalizations
- "It's obvious what this does." → Obvious to you, now, with context. Not obvious at 2am during an incident to someone who didn't write it.
- "Comments are noise." → Implementation comments are often noise. *Why* comments are always signal.
- "The code is self-documenting." → Function names document *what*. Only comments document *why*.
- "I'll clean it up later." → Later is when you're two milestones ahead and the context is gone. Clean it now.
## When to Run
- During code generation (inline, as you write)
- During code review (check existing code for violations)
- Before marking a slice complete (final readability pass)
## The Three Rules
### Rule 1: Boring over clever
Prefer the solution a junior developer can read and modify. If you face a choice between:
- An elegant one-liner and a readable 5-liner → use the 5-liner
- A clever abstraction and a repeated-but-obvious pattern → repeat it until repetition is clearly worth abstracting
- A performance micro-optimization and readable code → readable code, unless the performance requirement is proven
**Exception:** performance-critical paths (must be documented with a benchmark that proves the optimization is necessary).
### Rule 2: Why-comments on every non-obvious decision
A comment is required when:
- The code does something that looks wrong but is intentional
- The code uses a non-standard approach for a reason
- A value or constant was chosen for a specific reason (not arbitrary)
- The code handles an edge case that isn't obvious from the types
Format:
```ts
// WHY: <reason the non-obvious thing is correct>
```
Examples:
```ts
// WHY: SQLite WAL mode is required here — the default journal mode causes
// write contention when multiple processes access the same DB file.
db.pragma("journal_mode = WAL");
// WHY: Retry up to 3 times with 1s backoff. The gateway has a 500ms cold-start
// window after idle; the first call will often fail.
const result = await retry(call, { times: 3, waitMs: 1000 });
// WHY: Empty array not null — callers use .length checks without null guards.
if (!data) return [];
```
### Rule 3: Clean interface contracts
Every exported function needs a contract that answers:
- **What does it return** (type + what null/undefined/empty means)
- **What are the preconditions** (what must be true for it to work)
- **What are the side effects** (writes, events, mutations)
Bad:
```ts
export function processUser(user) { ... }
```
Good:
```ts
/**
* Validate and normalize a user record for DB insertion.
* Returns null if the record fails validation (caller decides whether to throw).
* Side effects: none. Pure function.
* Precondition: user.id must be a non-empty string.
*/
export function processUser(user: RawUser): NormalizedUser | null { ... }
```
## Rewrites-Cheap Test
Before submitting a slice, ask:
1. **Could a new team member understand each function without reading its callers?**
If no → add why-comments or simplify.
2. **Could the core logic be replaced without touching the interface?**
If no → the interface is coupled to the implementation. Separate them.
3. **Are there any "magic" values without a named constant and a why-comment?**
If yes → name the constant and explain the value.
4. **Does every exported symbol have a contract (JSDoc with purpose + consumer)?**
If no → add it before marking the slice done.
## Anti-Patterns
| Pattern | Problem | Fix |
|---------|---------|-----|
| `// do the thing` | Describes what, not why | Replace with a why-comment or delete |
| `const x = 42` | Magic number | `const MAX_RETRIES = 3; // WHY: ...` |
| One-letter variables outside loops | Forces reader to track mental state | Use descriptive names |
| Deeply nested conditionals | Hard to follow control flow | Extract to named functions |
| Side effects in getters | Violates principle of least surprise | Separate reads from writes |
## Completion Criteria
- [ ] No magic values without named constants and why-comments
- [ ] Every non-obvious decision has a `// WHY:` comment
- [ ] Every exported symbol has a purpose + consumer JSDoc
- [ ] Core logic is replaceable without changing the interface
- [ ] A new team member can understand each function without external context

View file

@ -0,0 +1,96 @@
---
name: irreversible-ops
description: Human-review gate for irreversible operations — deploys, database migrations, published artifact pushes, force pushes, and destructive deletes. Use in any workflow that touches infra, DB schema, or published artifacts. Classifies reversibility, injects a mandatory verification step, and blocks autonomous progression past the gate.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: trusted
triggers:
- build
- repair
- "*"
---
# Irreversible Ops
## Iron Law
```
BEFORE AN IRREVERSIBLE OP: STOP, CLASSIFY, GATE.
NO AUTONOMOUS AGENT CROSSES AN IRREVERSIBLE BOUNDARY WITHOUT AN EXPLICIT HUMAN GATE.
```
An operation is irreversible if rolling it back requires more than running one command. If you are not certain, treat it as irreversible.
## Recognize Your Own Rationalizations
- "It's a dev environment — I can always recreate it." → Development data and schemas that are not in source control are irreversible. Assume production semantics until proven otherwise.
- "The migration is small and I've done this before." → Size and familiarity do not reduce irreversibility. The gate is about the op class, not the op size.
- "Autonomous mode is enabled, so I can proceed." → Autonomous mode governs pace and interaction style. It does not remove irreversibility gates.
- "I'll add a rollback plan after." → Rollback plan comes first, before the gate can be passed.
## Irreversible Op Classification
### Class A — Always requires human gate
| Operation | Why irreversible |
|-----------|-----------------|
| Database migration (schema change) | Column drops, type changes, constraint adds — data loss risk |
| Published package version bump | npm/PyPI/GitHub Releases — cannot be un-published cleanly |
| Force push to protected branch | Rewrites shared history |
| Production deploy | Live traffic impact; rollback window may close |
| Secret/credential rotation | Old credentials may already be in use |
| Mass delete (files, records, buckets) | Data loss if incorrect |
| External service configuration change | May affect other consumers |
### Class B — Requires gate in autonomous mode, can proceed in assisted/manual
| Operation | Condition |
|-----------|-----------|
| Database migration (data backfill) | If revert is a compensating migration |
| Git tag creation | If CI/CD triggers on tags |
| API endpoint removal | If consumers may exist |
| Config change affecting behaviour | If not behind a feature flag |
### Class C — No gate required
- Adding new columns (no existing data affected)
- Creating new tables
- Adding new endpoints
- Adding new feature flags (not yet enabled)
- Writing tests
- Modifying local dev config
## Gate Protocol
Before any Class A or Class B op, produce in writing:
```
Op class: <A | B>
Operation: <exact description of what will happen>
Affected scope: <which data, which services, which users>
Reversibility: <how to undo this if it goes wrong — be specific>
Rollback plan: <exact command(s) to roll back>
Verification: <how will you know it succeeded?>
Gate: BLOCKED — requires human confirmation before proceeding
```
Do NOT proceed until the human confirms. "Confirmed" means explicit approval of the exact operation described above, not a general "go ahead."
## Post-Gate Checklist
After the human gate passes:
- [ ] Backup taken (or confirmed unnecessary with reason)
- [ ] Rollback plan is still valid
- [ ] Monitoring/alerting is in place
- [ ] Operation executed exactly as described in the gate record
- [ ] Verification result recorded
If the actual operation deviates from the gate description, stop and re-gate.
## Completion Criteria
- [ ] Every irreversible op in the workflow has been classified
- [ ] All Class A ops have a gate record + human confirmation
- [ ] All Class B ops in autonomous mode have a gate record + human confirmation
- [ ] Post-gate checklist complete for each executed op

View file

@ -0,0 +1,119 @@
---
name: observe-first
description: Enforce read-map-understand before any edit. Use at the start of any workflow that modifies existing code in an unfamiliar or partially-familiar codebase. Prevents the "Junior Refactor" failure mode — making changes without knowing what the code does or how it's used. Side-chain skill that gates the modify phase.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- repair
- review
- "*"
---
# Observe First
## Iron Law
```
NO EDIT WITHOUT A MENTAL MODEL.
NO MENTAL MODEL WITHOUT EVIDENCE.
```
If you have not completed Phase 1 (Structure) and Phase 2 (Usage), you are not permitted to modify any file. The modification phase is blocked until both phases produce written findings.
## Recognize Your Own Rationalizations
These are the exact shortcuts you will reach for. Each is wrong:
- "I can see what it does from the name." → Names lie. Read the body.
- "I only need to change one line." → You don't know which one yet without reading the callers.
- "I've seen this pattern before." → Familiarity is not analysis. This codebase may use the pattern differently.
- "I'll figure it out as I go." → Going is the wrong order. Understand first, then go.
- "The tests will catch mistakes." → Tests catch regressions you knew about. They don't catch structural misunderstandings.
## When to Run
- Any workflow that modifies existing code you haven't read end-to-end in this session.
- Planning phases that require accurate impact analysis before choosing an approach.
- Whenever the scope of a change is unclear.
Do NOT skip this skill for "small" changes — small changes with wrong mental models cause the most silent bugs.
## Skill Chain
Side-chain gate. Blocks the modify phase until both observe phases complete.
```
← prev: plan, spec-first-tdd, or any workflow beginning a modify phase
→ next: return to the invoking workflow once Phase 1 + Phase 2 are in writing
```
## Phase 1 — Structure Map
Map the file/module being modified before touching it.
```bash
# Who owns the symbol?
rg -n "export.*<symbol>|function <symbol>|class <symbol>" src/ packages/
# What does the file do?
cat <file> | head -80 # module header, imports, exports
rg -n "export " <file> # public surface
# What are its dependencies?
rg -n "^import " <file> # what it imports
rg -rn "from.*<module>" src/ # who imports this module
```
Produce written output:
1. **Module purpose** — one sentence: why does this module exist?
2. **Exports** — list every exported symbol and its type
3. **Callers** — list every file that imports this module
4. **Dependencies** — list what this module imports from elsewhere
Do NOT proceed to Phase 2 until this list exists in writing.
## Phase 2 — Usage Analysis
For each symbol you intend to modify, trace how it is called.
```bash
# All call sites
rg -n "<symbol>" src/ packages/ --type ts --type js
# Test coverage
rg -rn "<symbol>" src/ --include="*.test.*"
# Recent history
git log --oneline -10 -- <file>
git log --oneline -10 -S "<symbol>" # commits that changed this symbol
```
Produce written output for each symbol:
1. **Call sites** — file:line for every caller, with the argument values passed
2. **Contract** — what callers expect in return (inferred from usage)
3. **Invariants** — what must be true before/after this symbol runs
4. **Change blast radius** — which callers break if you change the signature or behaviour
Do NOT write any code until this list exists in writing.
## Phase 3 — Modification (Unblocked)
Only after Phases 1 and 2 are documented:
1. Make the **smallest** change that satisfies the contract.
2. Keep changes inside the blast radius you mapped — no scope creep.
3. If the blast radius is larger than expected, surface it before continuing.
4. Update callers in the order dictated by the dependency map, not alphabetically.
## Completion Criteria
You may exit this skill and return to the invoking workflow when:
- [ ] Phase 1 findings written (module purpose, exports, callers, deps)
- [ ] Phase 2 findings written (call sites, contract, invariants, blast radius) for every symbol to be modified
- [ ] The modification is bounded to the mapped blast radius
If Phase 1 or Phase 2 reveals that the change is larger than originally scoped, **stop and surface the new scope** before modifying anything.

View file

@ -0,0 +1,134 @@
---
name: state-discipline
description: Enforce structured, deterministic state management in long-running workflows. Use in any multi-step workflow that persists state across iterations. Prevents LLM-managed state, in-memory-only state, and unstructured file-based state — the three failure modes that cause autonomous loops to lose track of where they are.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- plan
- "*"
---
# State Discipline
## Iron Law
```
STATE LIVES IN SQLITE OR ON DISK AS STRUCTURED FILES.
NEVER IN THE LLM'S CONTEXT WINDOW.
NEVER IN MEMORY ACROSS STEPS.
```
Context-window state is lost on restart, summarization, and context compaction. In-memory state is lost on crash. Only SQLite and structured files survive restarts, crashes, and context rotation.
## Recognize Your Own Rationalizations
- "I'll track the progress in my context." → Context is summarized and truncated. Progress state in context is lost exactly when you need it most — after a crash or a long run.
- "I'll use a JSON object in a variable." → In-memory variables don't survive the tool call boundary. Each tool invocation is a fresh execution context.
- "It's simpler to just write to a text file." → Unstructured text files can't be queried, can't be joined, and produce parse errors under concurrent access. Use SQLite.
- "I'll write the state management after the feature works." → State management is not a feature — it is the foundation. Without it, you can't resume, can't retry, and can't verify.
## When to Run
- Before designing any multi-step workflow that must survive restart
- When a workflow has been running for more than 2 iterations
- When implementing retry logic that requires tracking attempts
- When implementing any lock, queue, or work-item pattern
## The Four State Rules
### Rule 1: SQLite for structured state
Use `.sf/sf.db` (or a task-scoped DB) for any state with schema, ordering, priority, joins, or queries.
**Use SQLite when:**
- Tracking work items (pending/in-progress/done)
- Recording retry counts
- Storing key-value configuration that persists across steps
- Any state that needs to be queried or filtered
**Use structured files when:**
- The state is a single document (a plan, a spec, an evidence file)
- The state is append-only and never queried (logs)
- The state must be human-readable and is the primary artifact
**Never use:**
- In-memory variables for state that crosses step boundaries
- Free-form text files for state that needs to be queried
- LLM context window for state that must survive restart
### Rule 2: Schema before data
Define the schema explicitly before inserting any rows. The schema is the contract:
```sql
CREATE TABLE IF NOT EXISTS workflow_units (
id TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'pending' -- pending | in_progress | done | blocked
CHECK(status IN ('pending','in_progress','done','blocked')),
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
error TEXT -- last error if status = blocked
);
```
Never add rows to an undefined table. Never use a table whose schema you haven't verified.
### Rule 3: Atomic transitions
State transitions must be atomic. Use SQLite transactions for multi-step transitions:
```sql
BEGIN;
UPDATE workflow_units SET status = 'in_progress', updated_at = datetime('now')
WHERE id = :id AND status = 'pending'; -- conditional: only if still pending
-- do the work
UPDATE workflow_units SET status = 'done', updated_at = datetime('now')
WHERE id = :id;
COMMIT;
```
Never set status to 'in_progress' in one statement and 'done' in another without a transaction — a crash between the two leaves inconsistent state.
### Rule 4: Resume from state, not from memory
Every workflow step must be resumable from the DB alone:
```sql
-- Find the next pending unit (resumable from cold start)
SELECT * FROM workflow_units
WHERE status = 'pending'
AND NOT EXISTS (
SELECT 1 FROM workflow_units dep
JOIN unit_deps d ON d.depends_on = dep.id
WHERE d.unit_id = workflow_units.id AND dep.status != 'done'
)
ORDER BY priority DESC, created_at ASC
LIMIT 1;
```
If you cannot reconstruct "where the workflow is" from a single SQL query, the state model is wrong.
## State Inventory Checklist
Before implementing a multi-step workflow, produce this inventory:
```
State item: <what needs to be remembered>
Lifetime: <step | iteration | session | permanent>
Schema: <table + columns, or file path + format>
Read pattern: <how it is queried>
Write pattern: <when and how it is updated>
Conflict rule: <what happens if two processes write simultaneously>
Recovery: <how to detect and fix corrupt state>
```
## Completion Criteria
- [ ] All cross-step state is in SQLite or structured files
- [ ] Schema is defined before any data is written
- [ ] All state transitions are atomic (transactions for multi-step)
- [ ] The workflow is resumable from the DB alone after a cold restart
- [ ] No state stored only in context or in-memory variables

View file

@ -0,0 +1,91 @@
---
name: vertical-slice
description: Enforce end-to-end working increments at each workflow step. Use during planning and decomposition phases. Prevents "horizontal layers" — building all models, then all services, then all tests — which produces nothing shippable until the very end. Every slice must be testable and deployable in isolation.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- plan
- build
- "*"
---
# Vertical Slice
## Iron Law
```
EVERY SLICE MUST BE INDEPENDENTLY TESTABLE AND DEPLOYABLE.
NO SLICE IS DONE UNTIL ITS CONSUMER PATH WORKS END-TO-END.
```
A slice that produces "partial infrastructure" is not a slice — it is a layer. Layers are not shippable. If the slice cannot be verified in isolation, it is too large or wrongly cut.
## Recognize Your Own Rationalizations
- "I'll wire it up in the next slice." → If it can't be verified now, you can't confirm the first slice worked. Bugs compound invisibly.
- "It's more efficient to build all the DB tables first." → It is more efficient to ship nothing until the very end. Horizontal layers guarantee integration surprises.
- "The consumer isn't built yet." → Then build a stub consumer in this slice. The slice defines its own consumer path.
- "I'll test it all together when it's complete." → "Together" is where integration bugs hide. Test each slice independently.
## When to Run
- Planning or decomposition: before breaking a milestone into tasks.
- Slice review: before starting a new slice, confirm the previous one is truly end-to-end.
- When an autonomous loop has been running for more than two slices without a shippable increment.
## Skill Chain
Planning-phase skill. Inline with the main delivery chain.
```
← prev: architecture-planning, pm-planning, or any planning phase
→ next: spec-first-tdd (write the failing test for the first slice)
```
## Slice Definition Protocol
For each slice, define **before writing any code**:
```
Slice ID: <S01, S02, ...>
Purpose: <one sentence why does this slice exist? what value does it add?>
Entry point: <the user-visible or API-visible surface that exercises this slice>
Done state: <exact observable behaviour that proves this slice is complete>
Verifier: <the command or test that confirms done state must be runnable>
Stub strategy: <if a dependency isn't built yet, what stub/fake makes this testable?>
```
A slice without a `Verifier` is not a valid slice. Stop and define one before proceeding.
## Anti-Patterns to Detect and Reject
| Pattern | Problem | Correct Cut |
|---------|---------|-------------|
| "Add all DB tables" | No consumer, not testable alone | "Add one table + one read + one test" |
| "Build the service layer" | No entry point, no verifier | "Add one endpoint that returns real data from DB" |
| "Implement the model" | Model without integration is not slice | "Add model + minimal handler + test that calls handler" |
| "Set up infrastructure" | Infrastructure without behaviour is scaffolding | Include the first real use in the same slice |
| "Refactor X" | Refactors with no consumer test are invisible | Include the test that proves behaviour unchanged |
## Slice Sizing
**Right-sized slice:** completes in a single autonomous iteration, has one clear verifier, can be described in one sentence.
**Too large:** "Build the authentication system." Cut it: login endpoint → token validation → logout → password reset.
**Too small:** "Add an import statement." Merge it with the first meaningful use.
**Boundary check:** If a slice takes more than one session to complete, it is too large. Cut it.
## Completion Criteria
Each slice is done when:
- [ ] `Verifier` command runs and passes
- [ ] The consumer path works end-to-end (not "the model is ready")
- [ ] No "temporary stubs" left in production paths (test stubs are fine)
- [ ] The done state matches what was defined before coding started
If the verifier passes but the done state wasn't defined upfront, you completed something — you just don't know what. Define done state first next time.