sf snapshot: uncommitted changes after 131m inactivity

This commit is contained in:
Mikael Hugo 2026-05-09 02:53:47 +02:00
parent 5188b93ddc
commit 9875812c1b
44 changed files with 1149 additions and 87 deletions

View file

@ -0,0 +1,3 @@
{
"lastFullVacuumAt": "2026-05-08T20:15:21.317Z"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,3 @@
{
"lastFullVacuumAt": "2026-05-08T20:29:49.200Z"
}

Binary file not shown.

15
package-lock.json generated
View file

@ -5708,6 +5708,10 @@
"node_modules/@singularity-forge/engine-win32-x64-msvc": {
"optional": true
},
"node_modules/@singularity-forge/google-gemini-cli-provider": {
"resolved": "packages/google-gemini-cli-provider",
"link": true
},
"node_modules/@singularity-forge/native": {
"resolved": "packages/native",
"link": true
@ -14618,6 +14622,16 @@
"url": "https://github.com/sponsors/colinhacks"
}
},
"packages/google-gemini-cli-provider": {
"name": "@singularity-forge/google-gemini-cli-provider",
"version": "2.75.3",
"dependencies": {
"@google/gemini-cli-core": "0.40.1"
},
"engines": {
"node": ">=26.1.0"
}
},
"packages/native": {
"name": "@singularity-forge/native",
"version": "2.75.3",
@ -14651,6 +14665,7 @@
"@google/genai": "^1.40.0",
"@mistralai/mistralai": "^2.2.1",
"@sinclair/typebox": "^0.34.41",
"@singularity-forge/google-gemini-cli-provider": "^2.75.3",
"ajv": "^8.17.1",
"ajv-formats": "^3.0.1",
"chalk": "^5.6.2",

View file

@ -48,7 +48,8 @@
"build:pi-coding-agent": "npm --workspace @singularity-forge/pi-coding-agent run build",
"build:native-pkg": "npm --workspace @singularity-forge/native run build",
"build:rpc-client": "npm --workspace @singularity-forge/rpc-client run build",
"build:pi": "npm run build:native-pkg && npm run build:pi-tui && npm run build:pi-ai && npm run build:pi-agent-core && npm run build:pi-coding-agent",
"build:google-gemini-cli-provider": "npm --workspace @singularity-forge/google-gemini-cli-provider run build",
"build:pi": "npm run build:native-pkg && npm run build:pi-tui && npm run build:google-gemini-cli-provider && npm run build:pi-ai && npm run build:pi-agent-core && npm run build:pi-coding-agent",
"build:daemon": "npm --workspace @singularity-forge/daemon run build",
"build:core": "npm run build:pi && npm run build:rpc-client && npm run build:daemon && npm run check:versioned-json && tsc && npm run copy-resources && npm run copy-themes && npm run copy-export-html",
"build": "npm run build:core && node scripts/build-web-if-stale.cjs",

View file

@ -0,0 +1,23 @@
{
"name": "@singularity-forge/google-gemini-cli-provider",
"version": "2.75.3",
"description": "Gemini CLI Core transport helper for SF providers",
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"scripts": {
"build": "tsc -p tsconfig.json"
},
"dependencies": {
"@google/gemini-cli-core": "0.40.1"
},
"engines": {
"node": ">=26.1.0"
}
}

View file

@ -0,0 +1,40 @@
import assert from "node:assert/strict";
import { describe, test, vi } from "vitest";
// Shared mutable state the mocked modules below write into, so the test can
// inspect what the helper forwarded. vi.hoisted() runs before the vi.mock
// factories (which Vitest hoists above all imports), so this object exists
// by the time either mock factory executes.
const helperState = vi.hoisted(() => ({
  // AuthType value the helper passed to createContentGeneratorConfig.
  authType: undefined as unknown,
  // Params object the helper passed to makeFakeConfig.
  configParams: undefined as Record<string, unknown> | undefined,
}));
// Stub the package entry point: expose only the AuthType constant the helper
// reads, and a makeFakeConfig that records its params instead of building a
// real Config. The returned shape just needs to be an object the helper can
// pass along.
vi.mock("@google/gemini-cli-core", () => ({
  AuthType: { LOGIN_WITH_GOOGLE: "LOGIN_WITH_GOOGLE" },
  makeFakeConfig: vi.fn((params: Record<string, unknown>) => {
    // Capture for the assertions in the test body below.
    helperState.configParams = params;
    return { params };
  }),
}));
// Stub the deep contentGenerator module: record which authType the helper
// selects, and return a generator whose stream is empty so nothing network-
// or OAuth-related runs during the test.
vi.mock("@google/gemini-cli-core/dist/src/core/contentGenerator.js", () => ({
  createContentGeneratorConfig: vi.fn(async (_config, authType) => {
    // Capture for the auth-type assertion below.
    helperState.authType = authType;
    return { authType };
  }),
  createContentGenerator: vi.fn(async () => ({
    // Minimal ContentGenerator surface: an async generator with no items.
    async generateContentStream(): Promise<AsyncGenerator<unknown>> {
      return (async function* emptyStream() {})();
    },
  })),
}));
import { createGeminiCliContentGenerator } from "./index.js";
describe("google-gemini-cli-provider", () => {
  // Verifies the helper wires Gemini CLI Core with Google-login auth and
  // forwards the model id plus cwd-derived directories into the fake config.
  test("createGeminiCliContentGenerator_uses_google_login_auth", async () => {
    await createGeminiCliContentGenerator({ modelId: "gemini-3-pro" });
    // Auth must be the OAuth "login with Google" path, not an API-key path.
    assert.equal(helperState.authType, "LOGIN_WITH_GOOGLE");
    assert.equal(helperState.configParams?.model, "gemini-3-pro");
    // With no cwd/targetDir options, both default to process.cwd().
    assert.equal(helperState.configParams?.cwd, process.cwd());
    assert.equal(helperState.configParams?.targetDir, process.cwd());
  });
});

View file

@ -0,0 +1,48 @@
/**
* Google Gemini CLI transport helper.
*
* Purpose: keep the Gemini CLI Core auth and content-generator wiring in a
* dedicated workspace package so provider code can depend on one small helper
* instead of embedding the upstream integration inline.
*
* Consumer: `@singularity-forge/pi-ai` Google Gemini provider.
*/
import {
AuthType,
makeFakeConfig,
} from "@google/gemini-cli-core";
import {
createContentGenerator,
createContentGeneratorConfig,
type ContentGenerator,
} from "@google/gemini-cli-core/dist/src/core/contentGenerator.js";
/**
 * Options for {@link createGeminiCliContentGenerator}.
 */
export interface GeminiCliContentGeneratorOptions {
  /** Gemini model id, forwarded as the `model` field of the fake config. */
  modelId: string;
  /** Working directory for the fake config; defaults to `process.cwd()`. */
  cwd?: string;
  /** Target directory for the fake config; defaults to the resolved `cwd`. */
  targetDir?: string;
}
/**
* Create a Gemini CLI Core content generator for a model.
*
* Purpose: centralize the Code Assist setup and OAuth bootstrap logic in a
* reusable package so SF's Gemini provider can stay focused on stream shaping.
*
* Consumer: the Google Gemini provider in pi-ai.
*/
export async function createGeminiCliContentGenerator(
options: GeminiCliContentGeneratorOptions,
): Promise<ContentGenerator> {
const cwd = options.cwd ?? process.cwd();
const config = makeFakeConfig({
model: options.modelId,
cwd,
targetDir: options.targetDir ?? cwd,
});
const generatorConfig = await createContentGeneratorConfig(
config,
AuthType.LOGIN_WITH_GOOGLE,
);
return createContentGenerator(generatorConfig, config);
}

View file

@ -0,0 +1,28 @@
{
"compilerOptions": {
"target": "ES2024",
"module": "Node16",
"lib": ["ES2024"],
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"incremental": true,
"forceConsistentCasingInFileNames": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"inlineSources": true,
"inlineSourceMap": false,
"moduleResolution": "Node16",
"resolveJsonModule": true,
"allowImportingTsExtensions": false,
"experimentalDecorators": true,
"emitDecoratorMetadata": true,
"useDefineForClassFields": false,
"types": ["node"],
"outDir": "./dist",
"rootDir": "./src"
},
"include": ["src/**/*.ts"],
"exclude": ["node_modules", "dist", "**/*.d.ts", "src/**/*.d.ts"]
}

View file

@ -29,6 +29,7 @@
"@google/gemini-cli-core": "0.40.1",
"@google/genai": "^1.40.0",
"@mistralai/mistralai": "^2.2.1",
"@singularity-forge/google-gemini-cli-provider": "^2.75.3",
"@sinclair/typebox": "^0.34.41",
"ajv": "^8.17.1",
"ajv-formats": "^3.0.1",

View file

@ -5,41 +5,34 @@ import type { Context, Model } from "../types.js";
const geminiCliCore = vi.hoisted(() => ({
retryError: undefined as Error | undefined,
retryOptions: undefined as Record<string, unknown> | undefined,
fakeConfigParams: undefined as Record<string, unknown> | undefined,
generatorAuthType: undefined as unknown,
helperArgs: undefined as Record<string, unknown> | undefined,
}));
vi.mock("@google/gemini-cli-core", () => ({
AuthType: { LOGIN_WITH_GOOGLE: "LOGIN_WITH_GOOGLE" },
CodeAssistServer: class {
async generateContentStream(): Promise<AsyncGenerator<unknown>> {
return (async function* emptyStream() {})();
}
},
getOauthClient: vi.fn(async () => ({})),
makeFakeConfig: vi.fn((params: Record<string, unknown>) => {
geminiCliCore.fakeConfigParams = params;
return { params };
}),
retryWithBackoff: vi.fn(
async (_fn: unknown, options: Record<string, unknown>) => {
geminiCliCore.retryOptions = options;
throw geminiCliCore.retryError ?? new Error("quota exhausted");
},
),
setupUser: vi.fn(async () => ({ projectId: "test-project" })),
}));
vi.mock("@google/gemini-cli-core/dist/src/core/contentGenerator.js", () => ({
createContentGeneratorConfig: vi.fn(async (_config, authType) => {
geminiCliCore.generatorAuthType = authType;
return { authType };
}),
createContentGenerator: vi.fn(async () => ({
async generateContentStream(): Promise<AsyncGenerator<unknown>> {
return (async function* emptyStream() {})();
vi.mock("@singularity-forge/google-gemini-cli-provider", () => ({
createGeminiCliContentGenerator: vi.fn(
async (args: Record<string, unknown>) => {
geminiCliCore.helperArgs = args;
return {
async generateContentStream(): Promise<AsyncGenerator<unknown>> {
return (async function* emptyStream() {})();
},
};
},
})),
),
}));
import { streamGoogleGeminiCli } from "./google-gemini-cli.js";
@ -82,12 +75,7 @@ describe("google-gemini-cli provider retry ownership", () => {
| { maxAttempts?: unknown }
| undefined;
assert.equal(retryOptions?.maxAttempts, 1);
assert.equal(
geminiCliCore.fakeConfigParams?.model,
"gemini-3-flash-preview",
);
assert.equal(geminiCliCore.fakeConfigParams?.clientName, undefined);
assert.equal(geminiCliCore.generatorAuthType, "LOGIN_WITH_GOOGLE");
assert.equal(geminiCliCore.helperArgs?.modelId, "gemini-3-flash-preview");
assert.equal(result.stopReason, "error");
assert.match(result.errorMessage ?? "", /exhausted your capacity/i);
assert.equal(result.retryAfterMs, 54_000);

View file

@ -1,24 +1,12 @@
/**
* Google Gemini CLI provider.
*
* Delegates auth, project discovery, and the Code Assist transport to
* @google/gemini-cli-core the library behind Google's Gemini tooling.
* cli-core reads ~/.gemini/oauth_creds.json itself when present, refreshes tokens,
* discovers the project (free-tier or whatever's onboarded server-side)
* via setupUser(), and handles all the User-Agent / quota-classification details.
* Delegates auth, project discovery, and the Code Assist transport setup to
* the dedicated google-gemini-cli-provider package.
* Request retry/fallback stays in the caller so SF can move to the next model.
*/
import {
AuthType,
makeFakeConfig,
retryWithBackoff,
} from "@google/gemini-cli-core";
import type { ContentGenerator } from "@google/gemini-cli-core/dist/src/core/contentGenerator.js";
import {
createContentGenerator,
createContentGeneratorConfig,
} from "@google/gemini-cli-core/dist/src/core/contentGenerator.js";
import { retryWithBackoff } from "@google/gemini-cli-core";
import type {
Content,
GenerateContentParameters,
@ -55,6 +43,7 @@ import {
isAutoReasoning,
resolveReasoningLevel,
} from "./simple-options.js";
import { createGeminiCliContentGenerator } from "@singularity-forge/google-gemini-cli-provider";
/**
* Thinking level for Gemini 3 models.
@ -73,7 +62,8 @@ export type GoogleThinkingLevel =
/**
* Options for `streamGoogleGeminiCli()`.
*
* Delegates auth to cli-core (reads ~/.gemini/oauth_creds.json via `getOauthClient()`);
* Delegates auth to the helper package (reads ~/.gemini/oauth_creds.json via
* Gemini CLI Core's transport setup);
* `projectId` is auto-discovered and not used by this provider (apiKey is ignored).
* Thinking is configured separately from base `StreamOptions` because Gemini 2 and 3
* models use incompatible enum formats (budgetTokens vs. level).
@ -100,30 +90,6 @@ export interface GoogleGeminiCliOptions extends StreamOptions {
// Counter for generating unique tool call IDs
let toolCallCounter = 0;
/**
* Build a Code Assist content generator using cli-core's official content-generator path.
*
* Upstream Gemini CLI does not instantiate CodeAssistServer directly from the
* caller. It creates a ContentGeneratorConfig, lets createContentGenerator()
* build the GeminiCLI User-Agent and transport headers, then delegates to
* createCodeAssistContentGenerator() for OAuth, setupUser(), and Code Assist.
*
* Both calls memoize internally inside cli-core repeat invocations are
* cheap.
*/
async function getCodeAssistServer(modelId: string): Promise<ContentGenerator> {
const config = makeFakeConfig({
model: modelId,
cwd: process.cwd(),
targetDir: process.cwd(),
});
const generatorConfig = await createContentGeneratorConfig(
config,
AuthType.LOGIN_WITH_GOOGLE,
);
return createContentGenerator(generatorConfig, config);
}
function parseDurationMs(value: string): number | undefined {
const match = value.match(/(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?/i);
if (!match || !match[0]) return undefined;
@ -178,14 +144,14 @@ function isGemini3Model(modelId: string): boolean {
}
/**
* Stream a chat completion from Google Gemini via the cli-core transport.
* Stream a chat completion from Google Gemini via the helper package and cli-core transport.
*
* Auth is handled transparently by cli-core (`getCodeAssistServer()` reads OAuth creds from
* ~/.gemini/oauth_creds.json and triggers browser OAuth on first run). Project ID is auto-discovered
* from the Code Assist API; `apiKey` is ignored. Casting the request as `any` works around the fact
* that cli-core bundles its own nested `@google/genai` copy (nominal type split at packaging time;
* runtime shapes are byte-identical). Returns a real-time stream emitting start, delta, end, and
* error events that accumulate into an `AssistantMessage`.
* The helper package owns the OAuth/bootstrap path against `@google/gemini-cli-core`, including
* `~/.gemini/oauth_creds.json` and Gemini Code Assist project discovery. `apiKey` is ignored.
* Casting the request as `any` works around the fact that cli-core bundles its own nested
* `@google/genai` copy (nominal type split at packaging time; runtime shapes are byte-identical).
* Returns a real-time stream emitting start, delta, end, and error events that accumulate into
* an `AssistantMessage`.
*/
export const streamGoogleGeminiCli: StreamFunction<
"google-gemini-cli",
@ -222,9 +188,10 @@ export const streamGoogleGeminiCli: StreamFunction<
if (nextReq !== undefined) {
req = nextReq as GenerateContentParameters;
}
// cli-core handles auth + project discovery. SF uses cli-core directly
// and does not spawn a separate provider CLI process.
const server = await getCodeAssistServer(req.model);
// cli-core handles auth + project discovery through the helper package.
const server = await createGeminiCliContentGenerator({
modelId: req.model,
});
const promptId = `pi-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
// Cast through `any` — cli-core bundles its own nested @google/genai copy,
// so TypeScript sees two structurally-identical-but-distinct Content types.
@ -233,7 +200,6 @@ export const streamGoogleGeminiCli: StreamFunction<
const streamGen = await retryWithBackoff(
() => server.generateContentStream(req as any, promptId, "USER" as any),
{
authType: AuthType.LOGIN_WITH_GOOGLE,
// SF owns cross-model fallback. Let cli-core classify quota errors,
// but do not let it hold the turn through its 10-attempt retry loop.
maxAttempts: 1,

View file

@ -5,15 +5,15 @@ import { parseArgs } from "./args.js";
describe("parseArgs", () => {
it("parses optional-value extension flags with implicit and explicit values", () => {
const extensionFlags = new Map([
["genai-proxy", { type: "string" as const, allowNoValue: true }],
["demo-flag", { type: "string" as const, allowNoValue: true }],
]);
const defaultFlagArgs = parseArgs(["--genai-proxy"], extensionFlags);
const explicitFlagArgs = parseArgs(["--genai-proxy=8080"], extensionFlags);
const defaultFlagArgs = parseArgs(["--demo-flag"], extensionFlags);
const explicitFlagArgs = parseArgs(["--demo-flag=8080"], extensionFlags);
assert.deepEqual(
[
defaultFlagArgs.unknownFlags.get("genai-proxy"),
explicitFlagArgs.unknownFlags.get("genai-proxy"),
defaultFlagArgs.unknownFlags.get("demo-flag"),
explicitFlagArgs.unknownFlags.get("demo-flag"),
],
[true, "8080"],
);

View file

@ -190,7 +190,7 @@ export class Editor implements Component, Focusable {
private autocompleteDebounceTimer: ReturnType<typeof setTimeout> | null =
null;
private lastAutocompleteLookupPrefix: string | null = null;
private static readonly AUTOCOMPLETE_DEBOUNCE_MS = 150;
private static readonly AUTOCOMPLETE_DEBOUNCE_MS = 50;
// Paste tracking for large pastes
private pastes: Map<number, string> = new Map();

View file

@ -94,6 +94,7 @@ if (require.main === module) {
const WORKSPACE_PACKAGES = [
"native",
"pi-tui",
"google-gemini-cli-provider",
"pi-ai",
"pi-agent-core",
"pi-coding-agent",

View file

@ -28,6 +28,7 @@ mkdirSync(piAgentDir, { recursive: true });
const copied = [];
if (copyDir("extensions")) copied.push("extensions");
if (copyDir("skills")) copied.push("skills");
if (copyDir("workflow-skills")) copied.push("workflow-skills");
if (copyDir("agents")) copied.push("agents");
const agentsMdSrc = join(resourcesDir, "AGENTS.md");

View file

@ -36,6 +36,7 @@ const scopeDir = join(root, "node_modules", scope);
const packageDirs = [
"native",
"pi-agent-core",
"google-gemini-cli-provider",
"pi-ai",
"pi-coding-agent",
"pi-tui",

View file

@ -60,6 +60,7 @@ function removeIfContentMatches(targetPath, sourcePath, label) {
removeResourceEntries("extensions");
removeResourceEntries("skills");
removeResourceEntries("workflow-skills");
removeResourceEntries("agents");
removeIfContentMatches(
join(piAgentDir, "AGENTS.md"),

View file

@ -69,6 +69,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
{ cmd: "queue", desc: "Queue and reorder future milestones" },
{ cmd: "quick", desc: "Execute a quick task without full planning overhead" },
{ cmd: "discuss", desc: "Discuss architecture and decisions" },
{ cmd: "steer", desc: "Steerable autonomous panel (Shift+Tab)" },
{ cmd: "capture", desc: "Fire-and-forget thought capture" },
{ cmd: "debug", desc: "Create and inspect persistent /debug sessions" },
{ cmd: "scan", desc: "Run source and project scans" },

View file

@ -19,6 +19,13 @@ export default async function registerExtension(pi) {
// tools, hooks) fails — e.g. due to a Windows-specific import error.
const { registerSFCommands } = await import("./commands/index.js");
registerSFCommands(pi);
// Register steerable autonomous extension for Copilot Auto-style controls
const { default: steerableAutonomousExtension } = await import(
"./steerable-autonomous-extension.js"
);
steerableAutonomousExtension(pi);
// Full setup (shortcuts, tools, hooks) in a separate try/catch so that
// any platform-specific load failure doesn't take out the core command.
try {

View file

@ -11,7 +11,9 @@ import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
const SKILL_FILENAME = "SKILL.md";
export { SKILL_FILENAME };
const USER_SKILL_DIR = join(process.env.HOME ?? "", ".sf", "skills");
export { USER_SKILL_DIR };
const BUNDLED_SKILL_DIR = join(
dirname(fileURLToPath(import.meta.url)),
"..",
@ -19,6 +21,15 @@ const BUNDLED_SKILL_DIR = join(
"..",
"skills",
);
export { BUNDLED_SKILL_DIR };
const WORKFLOW_SKILL_DIR = join(
dirname(fileURLToPath(import.meta.url)),
"..",
"..",
"..",
"workflow-skills",
);
export { WORKFLOW_SKILL_DIR };
/**
* Find all skill directories under a base path.
@ -41,12 +52,12 @@ export function discoverSkillDirs(basePath) {
}
/**
* Discover skills from all sources: project, user, and built-in.
* Discover skills from all sources: project, user, built-in, and workflow-internal.
*/
export function discoverAllSkills(projectPath, options = {}) {
const sources = [];
// Bundled SF skills
// Bundled SF skills (user-facing, shown in /skills catalog)
if (options.includeBundled && existsSync(BUNDLED_SKILL_DIR)) {
const bundledSkills = discoverSkillDirsInRoot(BUNDLED_SKILL_DIR);
for (const s of bundledSkills) {
@ -54,6 +65,14 @@ export function discoverAllSkills(projectPath, options = {}) {
}
}
// Workflow-internal skills (hidden from users, injected by the runtime)
if (options.includeWorkflow !== false && existsSync(WORKFLOW_SKILL_DIR)) {
const workflowSkills = discoverSkillDirsInRoot(WORKFLOW_SKILL_DIR);
for (const s of workflowSkills) {
sources.push({ ...s, source: "workflow" });
}
}
// Project skills
if (projectPath) {
const projectSkills = discoverSkillDirs(projectPath);

View file

@ -18,6 +18,7 @@ export {
readSkillFile,
SKILL_FILENAME,
USER_SKILL_DIR,
WORKFLOW_SKILL_DIR,
} from "./directory.js";
export {
createEvalCase,

View file

@ -48,7 +48,7 @@ export function loadSkills(projectPath, options = {}) {
}
const validation =
source === "bundled"
source === "bundled" || source === "workflow"
? validateBundledSkillFrontmatter(parsed.frontmatter)
: validateSkillFrontmatter(parsed.frontmatter);
if (!validation.valid) {
@ -64,7 +64,10 @@ export function loadSkills(projectPath, options = {}) {
}
const record = buildSkillRecord(path, parsed.frontmatter, parsed.body);
if (
if (source === "workflow") {
// Workflow-internal skills are never user-invocable regardless of frontmatter
record.userInvocable = false;
} else if (
source === "bundled" &&
parsed.frontmatter["user-invocable"] === undefined
) {
@ -132,7 +135,8 @@ export function getPermittedSkills(skills, activeProfile) {
*/
export function getUserInvocableSkills(skills) {
return skills.filter(
(s) => s.source === "bundled" && s.valid && s.userInvocable,
(s) =>
s.source !== "workflow" && s.source === "bundled" && s.valid && s.userInvocable,
);
}

View file

@ -0,0 +1,92 @@
---
name: assumption-log
description: Document assumptions, proceed with sensible defaults, surface for review at milestones. Use in research and planning workflows where context is incomplete. Blocks the "ask the user every 5 minutes" pattern and the "guess silently and break something" pattern. Every assumption becomes a named, reviewable artifact.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- plan
- research
- "*"
---
# Assumption Log
## Iron Law
```
NEVER GUESS SILENTLY.
NEVER ASK FOR EVERY MISSING DETAIL.
DOCUMENT THE ASSUMPTION, PICK A SENSIBLE DEFAULT, SURFACE FOR REVIEW.
```
Silent guessing produces invisible errors. Asking for every missing detail breaks autonomous flow. The correct middle path: make the assumption explicit, pick a defensible default, continue, and surface the log at review gates.
## Recognize Your Own Rationalizations
- "I'll just ask the user." → Ask only when the decision is irreversible or the cost of a wrong assumption is high. For everything else: document and proceed.
- "I know what they meant." → If you know, document the inference explicitly. If you don't know, document the assumption and the default you chose.
- "It's obvious — I don't need to write it down." → What is obvious to you during planning is invisible to the reviewer and to your future self. Write it down.
- "I'll address it when it comes up." → When it comes up, you won't remember what assumption you made. The log is the memory.
## When to Run
- At the start of any research or planning phase with incomplete context
- When a planning decision depends on information that isn't in the codebase or spec
- When a scope decision must be made without explicit instruction
- Before each irreversible op (combine with `irreversible-ops` skill)
## Assumption Entry Format
For each assumption, record:
```
Assumption ID: A-<NNN>
Category: <scope | design | dependency | behaviour | constraint>
Statement: <what you are assuming to be true>
Basis: <why this default was chosen — evidence, convention, or reasoning>
Default chosen: <the specific value, behaviour, or approach you will proceed with>
Confidence: <high | medium | low>
Falsifier: <what evidence would prove this assumption wrong>
Review gate: <at what milestone or checkpoint this should be surfaced>
Impact if wrong: <what breaks if the assumption is incorrect>
```
**Confidence guidelines:**
- `high` — strong evidence from code, docs, or established convention; probably correct
- `medium` — inferred from partial evidence; plausible but should be confirmed
- `low` — no evidence; pure default; must be confirmed before the affected code ships
## Assumption Categories
**Scope** — what is in/out of this task
> "Assumption: the email notification feature is out of scope for this slice. Basis: spec says 'user profile update' with no mention of notifications. Default: skip. Review at slice completion."
**Design** — how something should be structured
> "Assumption: use SQLite for local state storage rather than JSON files. Basis: project uses SQLite everywhere else. Default: SQLite. Confidence: high."
**Dependency** — which version, API, or external behaviour to rely on
> "Assumption: the gateway API responds within 5 seconds. Basis: no SLA documented; 5s is standard for synchronous APIs. Default: 5s timeout. Confidence: medium."
**Behaviour** — what the system should do in an edge case
> "Assumption: on parse error, return empty array not null. Basis: existing code uses empty arrays for not-found cases. Default: []. Confidence: high."
**Constraint** — limits on resources, permissions, or side effects
> "Assumption: this migration is safe to run without a maintenance window. Basis: adds a nullable column, no lock required. Default: proceed without window. Confidence: medium. Falsifier: if table > 10M rows, lock time may matter."
## Review Gate Protocol
At each milestone or slice completion, surface all `medium` and `low` confidence assumptions:
1. List all logged assumptions for the current slice
2. Mark each: `CONFIRMED` (user or evidence validated it), `REVISED` (different default chosen), or `OPEN` (still unconfirmed)
3. Any `low` confidence assumption that remains `OPEN` blocks slice completion
4. Any `medium` confidence assumption that remains `OPEN` is a known risk — document it in the slice evidence
## Completion Criteria
- [ ] All assumptions made during the workflow are logged with full entry format
- [ ] All `low` confidence assumptions are confirmed or revised before the slice ships
- [ ] All `medium` confidence assumptions are surfaced at the milestone gate
- [ ] The assumption log is attached to the slice/task artifacts in `.sf/active/{unit-id}/assumptions.md`

View file

@ -0,0 +1,116 @@
---
name: context-lean
description: Prune context before each LLM call. Use in any multi-step workflow that accumulates context across iterations. Less but more relevant context produces better outputs. Prevents context bloat — the single biggest silent quality degrader in long autonomous runs.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- "*"
---
# Context Lean
## Iron Law
```
CONTEXT IS A BUDGET, NOT A DUMP.
EVERY TOKEN IN CONTEXT MUST EARN ITS PLACE.
```
Adding more context is not safer than adding less. Irrelevant context degrades output quality by diluting signal. When in doubt, leave it out.
## Recognize Your Own Rationalizations
- "More context can't hurt — it gives the model more to work with." → Wrong. Noise degrades recall. The model attends to everything; irrelevant context steals attention from relevant context.
- "I'll include the whole file to be safe." → Include only the functions you're actually modifying. The rest is noise.
- "I need to include the history so the model understands the situation." → Include the summary, not the transcript. Summaries are signal; raw transcripts are noise.
- "The token limit isn't hit yet, so it's fine." → Token limits are not quality thresholds. Quality degrades well before the limit.
## When to Run
Before any LLM call in a multi-step workflow. Especially:
- Before each autonomous iteration
- Before a planning call that synthesizes many inputs
- After completing a phase (prune phase artifacts before the next phase)
- When the context window is more than 50% full
## Skill Chain
Inline skill. Run as a pre-call gate before each significant LLM invocation.
```
← prev: any skill, before its LLM call
→ next: return to the invoking skill with pruned context
```
## Pruning Protocol
Apply in order. Stop when the context is lean.
### Step 1 — Remove completed work
Anything that was needed to get to the current state but is not needed to proceed:
- Completed task details (keep the summary, drop the steps)
- Resolved errors (keep the fix, drop the stack trace)
- Superseded plans (keep the current plan, drop the draft)
### Step 2 — Summarize transcripts
Raw conversation history is always worse than a summary. For any context block older than the current phase:
1. Write a 3-5 sentence summary: what was decided, what was built, what failed
2. Replace the transcript block with the summary
3. Keep only the last 2-3 turns verbatim (for continuity)
### Step 3 — Scope file content
Never include entire files when you only need parts of them:
- Include only the functions/methods being modified
- Include only the test cases for the current behaviour
- Include only the error output relevant to the current failure
If a file must be included whole (e.g., a small config), it must be ≤ 50 lines or explicitly justified.
### Step 4 — Audit includes
For every block of context, ask: **if this were removed, would the model's output be worse?** If the answer is "maybe not," remove it.
Keep:
- The current task/goal (always)
- The specific code being modified (always)
- The error message or test failure driving the current step (always)
- The contract/spec for the current slice (always)
- Recent decisions that constrain the current step
Remove:
- Earlier phases' full output (summarize)
- Files not touched in the current step
- Passing test output (keep only failures)
- Dependency documentation (link, don't include)
- Comment threads and discussion (summarize conclusions)
### Step 5 — Verify budget
After pruning:
- Context should fit in < 30% of the token budget for simple tasks, < 60% for complex ones
- If still over budget after pruning, the task is too large for one call — split it
## Context Composition Rules
| Source | Include | Format |
|--------|---------|--------|
| Current task | Always | Full |
| Current file being edited | Only changed functions | Snippet |
| Current error / test failure | Always | Full |
| Previous phase output | Summary only | 3-5 sentences |
| Related file (not being edited) | Only the contract/signature | Snippet |
| Conversation history | Last 2-3 turns + summary of rest | Mixed |
| Documentation | Never inline | Reference by path |
## Completion Criteria
Context is lean when:
- [ ] No completed phase artifacts in full (only summaries)
- [ ] No entire files included when snippets suffice
- [ ] Every included block answers "yes" to the audit question
- [ ] Token budget is within target

View file

@ -0,0 +1,130 @@
---
name: error-routing
description: Route errors by type, not severity. Use in any workflow with retry or error-handling steps. Maps error classes (transient, semantic, auth, infra, logic, contract) to their correct handlers. Prevents the two most common agent failure modes — retrying logic errors, and ignoring transient failures.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- repair
- "*"
---
# Error Routing
## Iron Law
```
ROUTE BY CLASS FIRST, SEVERITY SECOND.
NEVER RETRY A LOGIC ERROR.
NEVER ABANDON A TRANSIENT ERROR WITHOUT RETRY.
```
Retrying a logic error wastes time and can cause data corruption. Abandoning a transient error causes false failures. Routing by severity ("it's a 500, must be important") misclassifies both.
## Recognize Your Own Rationalizations
- "It failed, so I'll try a different approach." → Different approach to what? Classify the error first. A different approach to a transient failure is wrong — you need the same approach with a wait.
- "It's a 500 error — must be a server problem." → HTTP 500s include logic errors, auth errors, and transient failures. Read the body.
- "Let me retry with exponential backoff." → Exponential backoff is for transient errors only. Applying it to logic errors just slows down the failure.
- "The test is flaky — I'll just retry it." → Flaky tests are infrastructure errors or race conditions. Classify and fix, don't retry blindly.
## Error Class Taxonomy
### Transient
**Definition:** Will resolve without code change, given time or retry.
**Examples:** network timeout, rate limit (429), service temporarily unavailable (503), lock contention, resource temporarily exhausted.
**Handler:** Retry with wait. Use Retry-After header if present; otherwise exponential backoff (1s, 2s, 4s, max 30s). Max 3 retries. If still failing after 3 retries, escalate to infra error.
**Do NOT:** change code, change approach, or report as a bug.
---
### Auth / Credential
**Definition:** Request rejected due to missing or invalid credentials.
**Examples:** 401, 403, expired token, invalid API key, insufficient permissions.
**Handler:** Do NOT retry. Surface immediately with the exact credential or permission required. Never attempt to infer or work around missing auth — escalate to the human.
**Do NOT:** retry, change approach, or attempt alternative auth methods.
---
### Logic / Contract
**Definition:** Code does the wrong thing. The error is in the logic, not the environment.
**Examples:** wrong output, failing assertion, type error, invariant violation, business rule violation, test failure (not flaky).
**Handler:** Debug, find root cause, fix. Follow `systematic-debugging` skill protocol. Do NOT retry or use a workaround.
**Do NOT:** retry, add a workaround, suppress the error.
---
### Infra / Environment
**Definition:** The execution environment is broken in a way that requires external action.
**Examples:** disk full, out of memory, missing required tool, corrupt DB, missing env var that cannot be inferred.
**Handler:** Surface immediately. Describe exactly what is missing and what the minimum fix is. Do NOT attempt to work around infra failures in code.
**Do NOT:** retry, assume it will resolve, add fallback code.
---
### Semantic / Integration
**Definition:** Two components disagree on a contract — schema mismatch, API version mismatch, unexpected data shape.
**Examples:** JSON parse error on valid-looking response, unexpected null where required, field name changed in dependency.
**Handler:** Investigate the contract. Identify which side is wrong (caller or callee). Fix the contract mismatch, not the symptom.
**Do NOT:** add nil-guards without understanding why the nil is there.
---
### Scope / Ambiguity
**Definition:** Cannot proceed because the task is not well-defined enough to make a correct decision.
**Examples:** conflicting requirements, missing spec, ambiguous acceptance criteria.
**Handler:** Surface the ambiguity with the specific decision that is blocked. Follow `assumption-log` protocol — document the assumption, pick a sensible default, mark for review.
**Do NOT:** guess silently.
## Routing Decision Tree
```
Error occurs
├─ Is it a network/rate-limit/timeout? → TRANSIENT → retry with wait
├─ Is it auth/403/401/credential? → AUTH → surface, do not retry
├─ Is it a test failure or wrong output? → LOGIC → debug + fix
├─ Is the environment broken? → INFRA → surface, external action needed
├─ Is it a contract/schema mismatch? → SEMANTIC → investigate contract
└─ Is the task underspecified? → SCOPE → assumption-log protocol
```
## Completion Criteria
For each error encountered in the workflow:
- [ ] Error classified by type (not severity)
- [ ] Handler applied per classification
- [ ] Resolution recorded (what the error was, what fixed it)
- [ ] No logic errors suppressed or worked around
- [ ] No transient errors abandoned without retry

View file

@ -0,0 +1,132 @@
---
name: handoff-readability
description: Enforce boring code, why-comments on non-obvious decisions, and clean interface contracts. Use in code-generation workflows. Makes rewrites cheap, reduces onboarding time, and prevents the "only the original author understands this" failure mode.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- review
- "*"
---
# Handoff Readability
## Iron Law
```
CODE IS READ 10X MORE THAN IT IS WRITTEN.
WRITE FOR THE READER WHO HAS ZERO CONTEXT.
BORING CODE IS A FEATURE.
```
Clever code that only the author can read is a liability. Every non-obvious decision is a future debugging session waiting to happen. Every missing comment on a "why" is a future misunderstanding that will produce a silent regression.
## Recognize Your Own Rationalizations
- "It's obvious what this does." → Obvious to you, now, with context. Not obvious at 2am during an incident to someone who didn't write it.
- "Comments are noise." → Implementation comments are often noise. *Why* comments are always signal.
- "The code is self-documenting." → Function names document *what*. Only comments document *why*.
- "I'll clean it up later." → Later is when you're two milestones ahead and the context is gone. Clean it now.
## When to Run
- During code generation (inline, as you write)
- During code review (check existing code for violations)
- Before marking a slice complete (final readability pass)
## The Three Rules
### Rule 1: Boring over clever
Prefer the solution a junior developer can read and modify. If you face a choice between:
- An elegant one-liner and a readable 5-liner → use the 5-liner
- A clever abstraction and a repeated-but-obvious pattern → repeat it until repetition is clearly worth abstracting
- A performance micro-optimization and readable code → readable code, unless the performance requirement is proven
**Exception:** performance-critical paths (must be documented with a benchmark that proves the optimization is necessary).
### Rule 2: Why-comments on every non-obvious decision
A comment is required when:
- The code does something that looks wrong but is intentional
- The code uses a non-standard approach for a reason
- A value or constant was chosen for a specific reason (not arbitrary)
- The code handles an edge case that isn't obvious from the types
Format:
```ts
// WHY: <reason the non-obvious thing is correct>
```
Examples:
```ts
// WHY: SQLite WAL mode is required here — the default journal mode causes
// write contention when multiple processes access the same DB file.
db.pragma("journal_mode = WAL");
// WHY: Retry up to 3 times with 1s backoff. The gateway has a 500ms cold-start
// window after idle; the first call will often fail.
const result = await retry(call, { times: 3, waitMs: 1000 });
// WHY: Empty array not null — callers use .length checks without null guards.
if (!data) return [];
```
### Rule 3: Clean interface contracts
Every exported function needs a contract that answers:
- **What does it return** (type + what null/undefined/empty means)
- **What are the preconditions** (what must be true for it to work)
- **What are the side effects** (writes, events, mutations)
Bad:
```ts
export function processUser(user) { ... }
```
Good:
```ts
/**
* Validate and normalize a user record for DB insertion.
* Returns null if the record fails validation (caller decides whether to throw).
* Side effects: none. Pure function.
* Precondition: user.id must be a non-empty string.
*/
export function processUser(user: RawUser): NormalizedUser | null { ... }
```
## Rewrites-Cheap Test
Before submitting a slice, ask:
1. **Could a new team member understand each function without reading its callers?**
If no → add why-comments or simplify.
2. **Could the core logic be replaced without touching the interface?**
If no → the interface is coupled to the implementation. Separate them.
3. **Are there any "magic" values without a named constant and a why-comment?**
If yes → name the constant and explain the value.
4. **Does every exported symbol have a contract (JSDoc with purpose + consumer)?**
If no → add it before marking the slice done.
## Anti-Patterns
| Pattern | Problem | Fix |
|---------|---------|-----|
| `// do the thing` | Describes what, not why | Replace with a why-comment or delete |
| `const x = 42` | Magic number | `const MAX_RETRIES = 3; // WHY: ...` |
| One-letter variables outside loops | Forces reader to track mental state | Use descriptive names |
| Deeply nested conditionals | Hard to follow control flow | Extract to named functions |
| Side effects in getters | Violates principle of least surprise | Separate reads from writes |
## Completion Criteria
- [ ] No magic values without named constants and why-comments
- [ ] Every non-obvious decision has a `// WHY:` comment
- [ ] Every exported symbol has a purpose + consumer JSDoc
- [ ] Core logic is replaceable without changing the interface
- [ ] A new team member can understand each function without external context

View file

@ -0,0 +1,96 @@
---
name: irreversible-ops
description: Human-review gate for irreversible operations — deploys, database migrations, published artifact pushes, force pushes, and destructive deletes. Use in any workflow that touches infra, DB schema, or published artifacts. Classifies reversibility, injects a mandatory verification step, and blocks autonomous progression past the gate.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: trusted
triggers:
- build
- repair
- "*"
---
# Irreversible Ops
## Iron Law
```
BEFORE AN IRREVERSIBLE OP: STOP, CLASSIFY, GATE.
NO AUTONOMOUS AGENT CROSSES AN IRREVERSIBLE BOUNDARY WITHOUT AN EXPLICIT HUMAN GATE.
```
An operation is irreversible if rolling it back requires more than running one command. If you are not certain, treat it as irreversible.
## Recognize Your Own Rationalizations
- "It's a dev environment — I can always recreate it." → Development data and schemas that are not in source control are irreversible. Assume production semantics until proven otherwise.
- "The migration is small and I've done this before." → Size and familiarity do not reduce irreversibility. The gate is about the op class, not the op size.
- "Autonomous mode is enabled, so I can proceed." → Autonomous mode governs pace and interaction style. It does not remove irreversibility gates.
- "I'll add a rollback plan after." → Rollback plan comes first, before the gate can be passed.
## Irreversible Op Classification
### Class A — Always requires human gate
| Operation | Why irreversible |
|-----------|-----------------|
| Database migration (schema change) | Column drops, type changes, constraint adds — data loss risk |
| Published package version bump | npm/PyPI/GitHub Releases — cannot be un-published cleanly |
| Force push to protected branch | Rewrites shared history |
| Production deploy | Live traffic impact; rollback window may close |
| Secret/credential rotation | Old credentials may already be in use |
| Mass delete (files, records, buckets) | Data loss if incorrect |
| External service configuration change | May affect other consumers |
### Class B — Requires gate in autonomous mode, can proceed in assisted/manual
| Operation | Condition |
|-----------|-----------|
| Database migration (data backfill) | If revert is a compensating migration |
| Git tag creation | If CI/CD triggers on tags |
| API endpoint removal | If consumers may exist |
| Config change affecting behaviour | If not behind a feature flag |
### Class C — No gate required
- Adding new columns (no existing data affected)
- Creating new tables
- Adding new endpoints
- Adding new feature flags (not yet enabled)
- Writing tests
- Modifying local dev config
## Gate Protocol
Before any Class A or Class B op, produce in writing:
```
Op class: <A | B>
Operation: <exact description of what will happen>
Affected scope: <which data, which services, which users>
Reversibility: <how to undo this if it goes wrong — be specific>
Rollback plan: <exact command(s) to roll back>
Verification: <how will you know it succeeded?>
Gate: BLOCKED — requires human confirmation before proceeding
```
Do NOT proceed until the human confirms. "Confirmed" means explicit approval of the exact operation described above, not a general "go ahead."
## Post-Gate Checklist
After the human gate passes:
- [ ] Backup taken (or confirmed unnecessary with reason)
- [ ] Rollback plan is still valid
- [ ] Monitoring/alerting is in place
- [ ] Operation executed exactly as described in the gate record
- [ ] Verification result recorded
If the actual operation deviates from the gate description, stop and re-gate.
## Completion Criteria
- [ ] Every irreversible op in the workflow has been classified
- [ ] All Class A ops have a gate record + human confirmation
- [ ] All Class B ops in autonomous mode have a gate record + human confirmation
- [ ] Post-gate checklist complete for each executed op

View file

@ -0,0 +1,119 @@
---
name: observe-first
description: Enforce read-map-understand before any edit. Use at the start of any workflow that modifies existing code in an unfamiliar or partially-familiar codebase. Prevents the "Junior Refactor" failure mode — making changes without knowing what the code does or how it's used. Side-chain skill that gates the modify phase.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- repair
- review
- "*"
---
# Observe First
## Iron Law
```
NO EDIT WITHOUT A MENTAL MODEL.
NO MENTAL MODEL WITHOUT EVIDENCE.
```
If you have not completed Phase 1 (Structure) and Phase 2 (Usage), you are not permitted to modify any file. The modification phase is blocked until both phases produce written findings.
## Recognize Your Own Rationalizations
These are the exact shortcuts you will reach for. Each is wrong:
- "I can see what it does from the name." → Names lie. Read the body.
- "I only need to change one line." → You don't know which one yet without reading the callers.
- "I've seen this pattern before." → Familiarity is not analysis. This codebase may use the pattern differently.
- "I'll figure it out as I go." → Going is the wrong order. Understand first, then go.
- "The tests will catch mistakes." → Tests catch regressions you knew about. They don't catch structural misunderstandings.
## When to Run
- Any workflow that modifies existing code you haven't read end-to-end in this session.
- Planning phases that require accurate impact analysis before choosing an approach.
- Whenever the scope of a change is unclear.
Do NOT skip this skill for "small" changes — small changes with wrong mental models cause the most silent bugs.
## Skill Chain
Side-chain gate. Blocks the modify phase until both observe phases complete.
```
← prev: plan, spec-first-tdd, or any workflow beginning a modify phase
→ next: return to the invoking workflow once Phase 1 + Phase 2 are in writing
```
## Phase 1 — Structure Map
Map the file/module being modified before touching it.
```bash
# Who owns the symbol?
rg -n "export.*<symbol>|function <symbol>|class <symbol>" src/ packages/
# What does the file do?
cat <file> | head -80 # module header, imports, exports
rg -n "export " <file> # public surface
# What are its dependencies?
rg -n "^import " <file> # what it imports
rg -rn "from.*<module>" src/ # who imports this module
```
Produce written output:
1. **Module purpose** — one sentence: why does this module exist?
2. **Exports** — list every exported symbol and its type
3. **Callers** — list every file that imports this module
4. **Dependencies** — list what this module imports from elsewhere
Do NOT proceed to Phase 2 until this list exists in writing.
## Phase 2 — Usage Analysis
For each symbol you intend to modify, trace how it is called.
```bash
# All call sites
rg -n "<symbol>" src/ packages/ --type ts --type js
# Test coverage
rg -rn "<symbol>" src/ --include="*.test.*"
# Recent history
git log --oneline -10 -- <file>
git log --oneline -10 -S "<symbol>" # commits that changed this symbol
```
Produce written output for each symbol:
1. **Call sites** — file:line for every caller, with the argument values passed
2. **Contract** — what callers expect in return (inferred from usage)
3. **Invariants** — what must be true before/after this symbol runs
4. **Change blast radius** — which callers break if you change the signature or behaviour
Do NOT write any code until this list exists in writing.
## Phase 3 — Modification (Unblocked)
Only after Phases 1 and 2 are documented:
1. Make the **smallest** change that satisfies the contract.
2. Keep changes inside the blast radius you mapped — no scope creep.
3. If the blast radius is larger than expected, surface it before continuing.
4. Update callers in the order dictated by the dependency map, not alphabetically.
## Completion Criteria
You may exit this skill and return to the invoking workflow when:
- [ ] Phase 1 findings written (module purpose, exports, callers, deps)
- [ ] Phase 2 findings written (call sites, contract, invariants, blast radius) for every symbol to be modified
- [ ] The modification is bounded to the mapped blast radius
If Phase 1 or Phase 2 reveals that the change is larger than originally scoped, **stop and surface the new scope** before modifying anything.

View file

@ -0,0 +1,134 @@
---
name: state-discipline
description: Enforce structured, deterministic state management in long-running workflows. Use in any multi-step workflow that persists state across iterations. Prevents LLM-managed state, in-memory-only state, and unstructured file-based state — the three failure modes that cause autonomous loops to lose track of where they are.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- build
- plan
- "*"
---
# State Discipline
## Iron Law
```
STATE LIVES IN SQLITE OR ON DISK AS STRUCTURED FILES.
NEVER IN THE LLM'S CONTEXT WINDOW.
NEVER IN MEMORY ACROSS STEPS.
```
Context-window state is lost on restart, summarization, and context compaction. In-memory state is lost on crash. Only SQLite and structured files survive restarts, crashes, and context rotation.
## Recognize Your Own Rationalizations
- "I'll track the progress in my context." → Context is summarized and truncated. Progress state in context is lost exactly when you need it most — after a crash or a long run.
- "I'll use a JSON object in a variable." → In-memory variables don't survive the tool call boundary. Each tool invocation is a fresh execution context.
- "It's simpler to just write to a text file." → Unstructured text files can't be queried, can't be joined, and produce parse errors under concurrent access. Use SQLite.
- "I'll write the state management after the feature works." → State management is not a feature — it is the foundation. Without it, you can't resume, can't retry, and can't verify.
## When to Run
- Before designing any multi-step workflow that must survive restart
- When a workflow has been running for more than 2 iterations
- When implementing retry logic that requires tracking attempts
- When implementing any lock, queue, or work-item pattern
## The Four State Rules
### Rule 1: SQLite for structured state
Use `.sf/sf.db` (or a task-scoped DB) for any state with schema, ordering, priority, joins, or queries.
**Use SQLite when:**
- Tracking work items (pending/in-progress/done)
- Recording retry counts
- Storing key-value configuration that persists across steps
- Any state that needs to be queried or filtered
**Use structured files when:**
- The state is a single document (a plan, a spec, an evidence file)
- The state is append-only and never queried (logs)
- The state must be human-readable and is the primary artifact
**Never use:**
- In-memory variables for state that crosses step boundaries
- Free-form text files for state that needs to be queried
- LLM context window for state that must survive restart
### Rule 2: Schema before data
Define the schema explicitly before inserting any rows. The schema is the contract:
```sql
CREATE TABLE IF NOT EXISTS workflow_units (
id TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'pending' -- pending | in_progress | done | blocked
CHECK(status IN ('pending','in_progress','done','blocked')),
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
error TEXT -- last error if status = blocked
);
```
Never add rows to an undefined table. Never use a table whose schema you haven't verified.
### Rule 3: Atomic transitions
State transitions must be atomic. Use SQLite transactions for multi-step transitions:
```sql
BEGIN;
UPDATE workflow_units SET status = 'in_progress', updated_at = datetime('now')
WHERE id = :id AND status = 'pending'; -- conditional: only if still pending
-- do the work
UPDATE workflow_units SET status = 'done', updated_at = datetime('now')
WHERE id = :id;
COMMIT;
```
Never set status to 'in_progress' in one statement and 'done' in another without a transaction — a crash between the two leaves inconsistent state.
### Rule 4: Resume from state, not from memory
Every workflow step must be resumable from the DB alone:
```sql
-- Find the next pending unit (resumable from cold start)
SELECT * FROM workflow_units
WHERE status = 'pending'
AND NOT EXISTS (
SELECT 1 FROM workflow_units dep
JOIN unit_deps d ON d.depends_on = dep.id
WHERE d.unit_id = workflow_units.id AND dep.status != 'done'
)
ORDER BY priority DESC, created_at ASC
LIMIT 1;
```
If you cannot reconstruct "where the workflow is" from a single SQL query, the state model is wrong.
## State Inventory Checklist
Before implementing a multi-step workflow, produce this inventory:
```
State item: <what needs to be remembered>
Lifetime: <step | iteration | session | permanent>
Schema: <table + columns, or file path + format>
Read pattern: <how it is queried>
Write pattern: <when and how it is updated>
Conflict rule: <what happens if two processes write simultaneously>
Recovery: <how to detect and fix corrupt state>
```
## Completion Criteria
- [ ] All cross-step state is in SQLite or structured files
- [ ] Schema is defined before any data is written
- [ ] All state transitions are atomic (transactions for multi-step)
- [ ] The workflow is resumable from the DB alone after a cold restart
- [ ] No state stored only in context or in-memory variables

View file

@ -0,0 +1,91 @@
---
name: vertical-slice
description: Enforce end-to-end working increments at each workflow step. Use during planning and decomposition phases. Prevents "horizontal layers" — building all models, then all services, then all tests — which produces nothing shippable until the very end. Every slice must be testable and deployable in isolation.
user-invocable: false
model-invocable: true
side-effects: none
permission-profile: normal
triggers:
- plan
- build
- "*"
---
# Vertical Slice
## Iron Law
```
EVERY SLICE MUST BE INDEPENDENTLY TESTABLE AND DEPLOYABLE.
NO SLICE IS DONE UNTIL ITS CONSUMER PATH WORKS END-TO-END.
```
A slice that produces "partial infrastructure" is not a slice — it is a layer. Layers are not shippable. If the slice cannot be verified in isolation, it is too large or wrongly cut.
## Recognize Your Own Rationalizations
- "I'll wire it up in the next slice." → If it can't be verified now, you can't confirm the first slice worked. Bugs compound invisibly.
- "It's more efficient to build all the DB tables first." → It is more efficient to ship nothing until the very end. Horizontal layers guarantee integration surprises.
- "The consumer isn't built yet." → Then build a stub consumer in this slice. The slice defines its own consumer path.
- "I'll test it all together when it's complete." → "Together" is where integration bugs hide. Test each slice independently.
## When to Run
- Planning or decomposition: before breaking a milestone into tasks.
- Slice review: before starting a new slice, confirm the previous one is truly end-to-end.
- When an autonomous loop has been running for more than two slices without a shippable increment.
## Skill Chain
Planning-phase skill. Inline with the main delivery chain.
```
← prev: architecture-planning, pm-planning, or any planning phase
→ next: spec-first-tdd (write the failing test for the first slice)
```
## Slice Definition Protocol
For each slice, define **before writing any code**:
```
Slice ID: <S01, S02, ...>
Purpose: <one sentence why does this slice exist? what value does it add?>
Entry point: <the user-visible or API-visible surface that exercises this slice>
Done state: <exact observable behaviour that proves this slice is complete>
Verifier: <the command or test that confirms done state must be runnable>
Stub strategy: <if a dependency isn't built yet, what stub/fake makes this testable?>
```
A slice without a `Verifier` is not a valid slice. Stop and define one before proceeding.
## Anti-Patterns to Detect and Reject
| Pattern | Problem | Correct Cut |
|---------|---------|-------------|
| "Add all DB tables" | No consumer, not testable alone | "Add one table + one read + one test" |
| "Build the service layer" | No entry point, no verifier | "Add one endpoint that returns real data from DB" |
| "Implement the model" | Model without integration is not slice | "Add model + minimal handler + test that calls handler" |
| "Set up infrastructure" | Infrastructure without behaviour is scaffolding | Include the first real use in the same slice |
| "Refactor X" | Refactors with no consumer test are invisible | Include the test that proves behaviour unchanged |
## Slice Sizing
**Right-sized slice:** completes in a single autonomous iteration, has one clear verifier, can be described in one sentence.
**Too large:** "Build the authentication system." Cut it: login endpoint → token validation → logout → password reset.
**Too small:** "Add an import statement." Merge it with the first meaningful use.
**Boundary check:** If a slice takes more than one session to complete, it is too large. Cut it.
## Completion Criteria
Each slice is done when:
- [ ] `Verifier` command runs and passes
- [ ] The consumer path works end-to-end (not "the model is ready")
- [ ] No "temporary stubs" left in production paths (test stubs are fine)
- [ ] The done state matches what was defined before coding started
If the verifier passes but the done state wasn't defined upfront, you completed something — you just don't know what. Define done state first next time.