feat: add cross-provider fallback when rate/quota limits are hit (#125)

When all credentials for a provider are exhausted, the system now automatically falls back to the next available provider in a user-configured fallback chain. Higher-priority providers are restored automatically when their backoff expires.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-14 15:45:44 -05:00 · 2026-03-14 15:45:44 -05:00 · adca6901ec
commit adca6901ec
parent 7f0caffd65
8 changed files with 995 additions and 15 deletions
--- a/.plans/issue-125-provider-fallback.md
+++ b/.plans/issue-125-provider-fallback.md
@ -0,0 +1,380 @@
+# Issue #125: Provider Fallback When Multiple Providers Configured
+<!-- Copyright (c) 2026 Jeremy McSpadden <jeremy@fluxlabs.net> -->
+
+## Overview
+
+Add cross-provider fallback so that when a provider hits rate/quota limits, the system
+automatically switches to the next entry in a user-configured fallback chain: either
+another provider serving an equivalent model, or a different model altogether.
+
+## Current State
+
+The codebase already supports:
+- **Multi-credential per provider** — round-robin or session-sticky selection
+- **Per-credential backoff tracking** — rate_limit (30s), quota_exhausted (30min), server_error (20s)
+- **Credential rotation on error** — `markUsageLimitReached()` backs off one key and returns
+  whether another key exists for the same provider
+- **Retry with exponential backoff** — 3 retries, 2s/4s/8s delays
+- **Error classification** — quota_exhausted, rate_limit, server_error, unknown
+
+The gap: fallback only works within a single provider (multiple API keys). There is no
+mechanism to fall back to a *different provider* serving the same or equivalent model.
+
+---
+
+## Architecture
+
+### Phase 1: Fallback Chain Configuration & Storage
+
+**Goal:** Let users define ordered fallback chains that map a primary model to backup
+model+provider combos.
+
+#### 1.1 — Settings Schema (`settings-manager.ts`)
+
+Add a new top-level setting:
+
+```typescript
+interface FallbackChainEntry {
+  provider: string;       // e.g. "zai", "alibaba", "openai"
+  model: string;          // e.g. "glm-5", "claude-opus-4-6"
+  priority: number;       // lower = higher priority (1 = primary)
+}
+
+interface FallbackSettings {
+  enabled: boolean;                          // default: false
+  chains: Record<string, FallbackChainEntry[]>;  // keyed by chain name
+  // Example:
+  // "coding": [
+  //   { provider: "zai", model: "glm-5", priority: 1 },
+  //   { provider: "alibaba", model: "glm-5", priority: 2 },
+  //   { provider: "openai", model: "gpt-4.1", priority: 3 }
+  // ]
+}
+```
+
+**Files to modify:**
+- `packages/pi-coding-agent/src/core/settings-manager.ts` — add `getFallbackSettings()`,
+  `setFallbackChain()`, `removeFallbackChain()`, getter/setter for `fallback.enabled`
+
+#### 1.2 — Settings File Location
+
+Stored in the existing `~/.pi/agent/settings.json` under a new `fallback` key.
+
+#### 1.3 — CLI Configuration Commands
+
+Add subcommands to the existing settings CLI:
+- `pi settings fallback enable/disable`
+- `pi settings fallback add-chain <name> --provider <p> --model <m> --priority <n>`
+- `pi settings fallback remove-chain <name>`
+- `pi settings fallback list`
+
+**Files to modify:**
+- `packages/pi-coding-agent/src/cli/commands/settings.ts` (or equivalent CLI entry point)
+
+---
+
+### Phase 2: Provider-Level Backoff Tracking
+
+**Goal:** Track backoff state at the provider level (not just credential level) so the
+fallback system knows when an entire provider is unavailable.
+
+#### 2.1 — Extend AuthStorage (`auth-storage.ts`)
+
+Add a provider-level backoff map alongside the existing credential-level one:
+
+```typescript
+private providerBackoff: Map<string, number> = new Map();
+// Map<provider, backoffExpiresAt>
+```
+
+**New methods:**
+```typescript
+markProviderExhausted(provider: string, errorType: UsageLimitErrorType): void
+isProviderAvailable(provider: string): boolean
+getProviderBackoffRemaining(provider: string): number  // ms until available, 0 if available
+```
+
+**Logic:** When `markUsageLimitReached()` returns `false` (all credentials for a provider
+are backed off), also mark the provider itself as backed off with the longest remaining
+credential backoff duration.
+
+**Files to modify:**
+- `packages/pi-coding-agent/src/core/auth-storage.ts`
+
+---
+
+### Phase 3: Fallback Resolution Engine
+
+**Goal:** Given a current model+provider that just failed, find the next available
+fallback from the configured chain.
+
+#### 3.1 — FallbackResolver (`fallback-resolver.ts` — new file)
+
+```typescript
+// packages/pi-coding-agent/src/core/fallback-resolver.ts
+
+export interface FallbackResult {
+  model: Model<Api>;
+  reason: string;  // "quota_exhausted on zai, falling back to alibaba"
+}
+
+export class FallbackResolver {
+  constructor(
+    private settings: SettingsManager,
+    private authStorage: AuthStorage,
+    private modelRegistry: ModelRegistry,
+  ) {}
+
+  /**
+   * Find the next available fallback for the current model.
+   * Returns null if no fallback is configured or available.
+   */
+  async findFallback(
+    currentModel: Model<Api>,
+    errorType: UsageLimitErrorType,
+  ): Promise<FallbackResult | null> {
+    // 1. Check if fallback is enabled
+    // 2. Find chain(s) containing currentModel's provider+model
+    // 3. Sort by priority
+    // 4. Skip entries where provider is backed off
+    // 5. Skip entries without valid API keys
+    // 6. Return first available, or null
+  }
+
+  /**
+   * Find the chain a model belongs to.
+   */
+  findChainForModel(provider: string, modelId: string): FallbackChainEntry[] | null
+
+  /**
+   * Get the highest-priority available model from a chain.
+   * Used on session start to pick the best available model.
+   */
+  async getBestAvailable(chainName: string): Promise<FallbackResult | null>
+}
+```
+
+#### 3.2 — Model Equivalence
+
+For same-model cross-provider fallback (the initial scope of this feature), the chain
+entries explicitly name the provider+model pairs. No automatic equivalence detection is
+needed — the user defines what's equivalent.
+
+---
+
+### Phase 4: Integrate Fallback into Retry Flow
+
+**Goal:** When credential rotation fails (all keys for a provider exhausted), try the
+fallback chain before giving up or doing exponential backoff.
+
+#### 4.1 — Modify `_handleRetryableError()` (`agent-session.ts`)
+
+Current flow:
+```
+1. Classify error
+2. Try credential rotation within provider → if success, retry immediately
+3. If quota_exhausted and all backed off → give up
+4. Exponential backoff retry
+```
+
+New flow:
+```
+1. Classify error
+2. Try credential rotation within provider → if success, retry immediately
+3. ** Try provider fallback via FallbackResolver **
+   a. If fallback found → swap model on agent, retry immediately
+   b. Emit event: "fallback_provider_switch" with old/new provider info
+4. If quota_exhausted and no fallback → give up
+5. Exponential backoff retry
+```
+
+**Key changes in agent-session.ts (~lines 2317-2370):**
+
+```typescript
+// After credential rotation fails:
+if (!hasAlternate) {
+  const fallbackResult = await this.fallbackResolver?.findFallback(
+    this.agent.model,
+    errorType,
+  );
+
+  if (fallbackResult) {
+    // Swap to fallback model
+    this.agent.setModel(fallbackResult.model);
+    this._removeLastError();
+    this._emitEvent("auto_retry_start", {
+      attempt: this._retryAttempt + 1,
+      delayMs: 0,
+      reason: fallbackResult.reason,
+    });
+    await this.agent.continue();
+    return true;
+  }
+}
+```
+
+#### 4.2 — Agent Model Swapping
+
+The agent needs a method to swap its model mid-conversation:
+
+```typescript
+// agent.ts or agent-loop.ts
+setModel(model: Model<Api>): void {
+  this.config.model = model;
+  // Re-resolve API key for new provider
+}
+```
+
+**Important:** The API key must also be re-resolved since we're switching providers.
+The `getApiKey` callback in `AgentOptions` already takes a provider string, so this
+should work naturally.
+
+**Files to modify:**
+- `packages/pi-coding-agent/src/core/agent-session.ts`
+- `packages/pi-ai/src/agent.ts` or `packages/pi-ai/src/agent-loop.ts`
+
+---
+
+### Phase 5: Provider Restoration (Auto-Upgrade)
+
+**Goal:** When a higher-priority provider's backoff expires, switch back to it.
+
+#### 5.1 — Pre-Request Priority Check
+
+Before each LLM request, check if a higher-priority provider in the chain has become
+available again:
+
+```typescript
+// In agent-loop.ts streamAssistantResponse(), before calling streamFn:
+if (this.fallbackResolver) {
+  const bestAvailable = await this.fallbackResolver.getBestAvailable(currentChain);
+  if (bestAvailable && bestAvailable.model.provider !== currentModel.provider) {
+    // Upgrade back to higher-priority provider
+    this.setModel(bestAvailable.model);
+    this._emitEvent("fallback_provider_restored", { ... });
+  }
+}
+```
+
+#### 5.2 — Quota Reset Awareness (Future Enhancement)
+
+For now, rely on backoff expiry times. A future enhancement could:
+- Parse rate limit headers for reset timestamps
+- Store per-provider quota windows (5-hour, daily, weekly, monthly)
+- Predict when quota will restore based on usage patterns
+
+This is complex and should be a separate issue.
+
+---
+
+### Phase 6: User-Facing Events & UI
+
+**Goal:** Surface fallback activity to the user so they know what's happening.
+
+#### 6.1 — New Events
+
+```typescript
+type FallbackEvent =
+  | { type: "fallback_provider_switch"; from: string; to: string; reason: string }
+  | { type: "fallback_provider_restored"; provider: string; reason: string }
+  | { type: "fallback_chain_exhausted"; chain: string; reason: string }
+```
+
+#### 6.2 — TUI Integration
+
+Display a brief notification in the TUI when fallback occurs:
+- `⚡ Switched from zai/glm-5 → alibaba/glm-5 (rate limit)`
+- `✓ Restored to zai/glm-5 (quota available)`
+- `⚠ All providers in chain "coding" exhausted`
+
+**Files to modify:**
+- `packages/pi-tui/src/` — event handler for new fallback events
+- Status bar or notification area in the TUI
+
+---
+
+## Implementation Order
+
+| Step | Phase | Effort | Dependencies |
+|------|-------|--------|-------------|
+| 1    | Phase 1.1-1.2: Settings schema | Small | None |
+| 2    | Phase 2: Provider-level backoff | Small | None |
+| 3    | Phase 3: FallbackResolver | Medium | Steps 1, 2 |
+| 4    | Phase 4: Retry integration | Medium | Step 3 |
+| 5    | Phase 5.1: Auto-restoration | Small | Step 4 |
+| 6    | Phase 1.3: CLI commands | Small | Step 1 |
+| 7    | Phase 6: Events & UI | Small | Step 4 |
+
+Steps 1 and 2 can be done in parallel. Steps 6 and 7 can be done in parallel.
+
+---
+
+## Key Design Decisions
+
+### 1. Explicit chains vs automatic model equivalence
+**Decision:** Explicit user-configured chains.
+**Why:** Automatic equivalence is unreliable — models with the same name from different
+providers may have different capabilities, limits, or pricing. Users should explicitly
+opt in to which models they consider interchangeable.
+
+### 2. Where fallback sits in the retry flow
+**Decision:** After credential rotation, before exponential backoff.
+**Why:** Provider fallback is a better recovery than waiting and retrying the same
+exhausted provider. If the fallback also fails, exponential backoff still kicks in.
+
+### 3. Model swap vs new agent
+**Decision:** Swap model on existing agent mid-conversation.
+**Why:** Creating a new agent would lose conversation context. The agent's `streamFn`
+already accepts model as a parameter, and `getApiKey` resolves per-provider, so
+swapping is straightforward.
+
+### 4. Restoration strategy
+**Decision:** Check before each request (lazy check on backoff expiry).
+**Why:** No background timers needed. The cost of one `isProviderAvailable()` check
+per request is negligible. More sophisticated quota tracking can be added later.
+
+### 5. Scope of fallback
+**Decision:** One global set of chains, applied within each session; no per-agent-type scoping (initially).
+**Why:** The issue mentions a per-agent-type toggle, but the simpler initial implementation
+is a global fallback chain that applies to any session using a model in the chain.
+Per-agent-type scoping can be added later by extending the chain config with an `agentTypes`
+filter.
+
+---
+
+## Risks & Mitigations
+
+| Risk | Impact | Mitigation |
+|------|--------|-----------|
+| Model swap mid-conversation changes behavior | Medium | Log the swap, let user disable fallback |
+| Different providers have different tool/feature support | High | Validate fallback model supports same API features before swapping |
+| Credential resolution race conditions | Low | Use existing file-lock mechanism in auth-storage |
+| Chain misconfiguration (nonexistent model) | Low | Validate chain entries on save, warn on invalid |
+| Backoff timing mismatch with actual quota reset | Medium | Conservative backoff defaults; Phase 5.2 for future improvement |
+
+---
+
+## Testing Strategy
+
+1. **Unit tests for FallbackResolver** — mock auth-storage and model-registry, test chain
+   resolution, priority ordering, backoff skipping
+2. **Unit tests for extended auth-storage** — provider-level backoff tracking
+3. **Integration test for retry flow** — simulate rate limit → credential rotation →
+   provider fallback → restoration
+4. **E2E test** — configure a chain, hit rate limit on provider A, verify automatic
+   switch to provider B
+5. **Settings tests** — validate chain CRUD operations, persistence, invalid input handling
+
+---
+
+## Files Summary
+
+| File | Action | Changes |
+|------|--------|---------|
+| `packages/pi-coding-agent/src/core/settings-manager.ts` | Modify | Add FallbackSettings types, getters/setters |
+| `packages/pi-coding-agent/src/core/auth-storage.ts` | Modify | Add provider-level backoff tracking |
+| `packages/pi-coding-agent/src/core/fallback-resolver.ts` | **New** | FallbackResolver class |
+| `packages/pi-coding-agent/src/core/agent-session.ts` | Modify | Integrate fallback into retry flow |
+| `packages/pi-ai/src/agent.ts` | Modify | Add `setModel()` method |
+| `packages/pi-coding-agent/src/cli/commands/settings.ts` | Modify | Add fallback CLI subcommands |
+| `packages/pi-tui/src/` | Modify | Fallback event display |
--- a/packages/pi-coding-agent/src/core/agent-session.ts
+++ b/packages/pi-coding-agent/src/core/agent-session.ts
@ -70,6 +70,7 @@ import {
 	wrapToolsWithExtensions,
 } from "./extensions/index.js";
 import type { BashExecutionMessage, CustomMessage } from "./messages.js";
+import { FallbackResolver } from "./fallback-resolver.js";
 import type { ModelRegistry } from "./model-registry.js";
 import { expandPromptTemplate, type PromptTemplate } from "./prompt-templates.js";
 import type { ResourceExtensionPaths, ResourceLoader } from "./resource-loader.js";
@ -120,7 +121,10 @@ export type AgentSessionEvent =
 			errorMessage?: string;
 	  }
 	| { type: "auto_retry_start"; attempt: number; maxAttempts: number; delayMs: number; errorMessage: string }
-	| { type: "auto_retry_end"; success: boolean; attempt: number; finalError?: string };
+	| { type: "auto_retry_end"; success: boolean; attempt: number; finalError?: string }
+	| { type: "fallback_provider_switch"; from: string; to: string; reason: string }
+	| { type: "fallback_provider_restored"; provider: string; reason: string }
+	| { type: "fallback_chain_exhausted"; reason: string };

 /** Listener function for agent session events */
 export type AgentSessionEventListener = (event: AgentSessionEvent) => void;
@ -267,6 +271,9 @@ export class AgentSession {
 	// Model registry for API key resolution
 	private _modelRegistry: ModelRegistry;

+	// Provider fallback resolver
+	private _fallbackResolver: FallbackResolver;
+
 	// Tool registry for extension getTools/setTools
 	private _toolRegistry: Map<string, AgentTool> = new Map();
 	private _toolPromptSnippets: Map<string, string> = new Map();
@ -284,6 +291,11 @@ export class AgentSession {
 		this._customTools = config.customTools ?? [];
 		this._cwd = config.cwd;
 		this._modelRegistry = config.modelRegistry;
+		this._fallbackResolver = new FallbackResolver(
+			this.settingsManager,
+			this._modelRegistry.authStorage,
+			this._modelRegistry,
+		);
 		this._extensionRunnerRef = config.extensionRunnerRef;
 		this._initialActiveToolNames = config.initialActiveToolNames;
 		this._baseToolsOverride = config.baseToolsOverride;
@ -303,6 +315,11 @@ export class AgentSession {
 		return this._modelRegistry;
 	}

+	/** Fallback resolver for cross-provider fallback */
+	get fallbackResolver(): FallbackResolver {
+		return this._fallbackResolver;
+	}
+
 	// =========================================================================
 	// Event Subscription
 	// =========================================================================
@ -868,6 +885,19 @@ export class AgentSession {
 			);
 		}

+		// Check if a higher-priority provider in the fallback chain has recovered
+		const restoration = await this._fallbackResolver.checkForRestoration(this.model);
+		if (restoration) {
+			const previousProvider = `${this.model.provider}/${this.model.id}`;
+			this.agent.setModel(restoration.model);
+			this.sessionManager.appendModelChange(restoration.model.provider, restoration.model.id);
+			this._emit({
+				type: "fallback_provider_restored",
+				provider: `${restoration.model.provider}/${restoration.model.id}`,
+				reason: `Restored from ${previousProvider}`,
+			});
+		}
+
 		// Validate API key
 		const apiKey = await this._modelRegistry.getApiKey(this.model, this.sessionId);
 		if (!apiKey) {
@ -2354,20 +2384,66 @@ export class AgentSession {
 				return true;
 			}

-			// All credentials are backed off. For quota-exhausted errors the backoff is very
-			// long (30+ min), so retrying immediately is futile and will only produce
-			// confusing secondary errors (e.g. "Authentication failed"). Give up now and
-			// surface the original quota error to the user.
-			if (errorType === "quota_exhausted") {
-				this._emit({
-					type: "auto_retry_end",
-					success: false,
-					attempt: this._retryAttempt,
-					finalError: message.errorMessage,
-				});
-				this._retryAttempt = 0;
-				this._resolveRetry();
-				return false;
+			// All credentials are backed off. Try cross-provider fallback before giving up.
+			if (isCredentialError) {
+				const fallbackResult = await this._fallbackResolver.findFallback(
+					this.model,
+					errorType,
+				);
+
+				if (fallbackResult) {
+					// Build the "from" label before swapping: this.model presumably tracks the agent's model after setModel()
+					const previousModel = `${this.model.provider}/${this.model.id}`;
+					this.agent.setModel(fallbackResult.model); // session-only swap — don't persist to settings
+					this.sessionManager.appendModelChange(fallbackResult.model.provider, fallbackResult.model.id);
+
+					// Remove error message from agent state
+					const msgs = this.agent.state.messages;
+					if (msgs.length > 0 && msgs[msgs.length - 1].role === "assistant") {
+						this.agent.replaceMessages(msgs.slice(0, -1));
+					}
+
+					this._emit({
+						type: "fallback_provider_switch",
+						from: previousModel,
+						to: `${fallbackResult.model.provider}/${fallbackResult.model.id}`,
+						reason: fallbackResult.reason,
+					});
+
+					this._emit({
+						type: "auto_retry_start",
+						attempt: this._retryAttempt + 1,
+						maxAttempts: settings.maxRetries,
+						delayMs: 0,
+						errorMessage: `${message.errorMessage} (${fallbackResult.reason})`,
+					});
+
+					// Retry immediately with fallback provider - don't increment _retryAttempt
+					setTimeout(() => {
+						this.agent.continue().catch(() => {
+							// Retry failed - will be caught by next agent_end
+						});
+					}, 0);
+
+					return true;
+				}
+
+				// No fallback available either
+				if (errorType === "quota_exhausted") {
+					this._emit({
+						type: "fallback_chain_exhausted",
+						reason: `All providers exhausted for ${this.model.provider}/${this.model.id}`,
+					});
+					this._emit({
+						type: "auto_retry_end",
+						success: false,
+						attempt: this._retryAttempt,
+						finalError: message.errorMessage,
+					});
+					this._retryAttempt = 0;
+					this._resolveRetry();
+					return false;
+				}
 			}
 		}

--- a/packages/pi-coding-agent/src/core/auth-storage.ts
+++ b/packages/pi-coding-agent/src/core/auth-storage.ts
@ -248,6 +248,13 @@ export class AuthStorage {
 	 */
 	private credentialBackoff: Map<string, Map<number, number>> = new Map();

+	/**
+	 * Provider-level backoff tracking.
+	 * Set when all credentials for a provider are backed off.
+	 * Map<provider, backoffExpiresAt>
+	 */
+	private providerBackoff: Map<string, number> = new Map();
+
 	private constructor(private storage: AuthStorageBackend) {
 		this.reload();
 	}
@ -398,6 +405,7 @@ export class AuthStorage {
 		delete this.data[provider];
 		this.providerRoundRobinIndex.delete(provider);
 		this.credentialBackoff.delete(provider);
+		this.providerBackoff.delete(provider);
 		this.persistProviderChange(provider, undefined);
 	}

@ -484,6 +492,43 @@ export class AuthStorage {
 		return true;
 	}

+	/**
+	 * Mark an entire provider as exhausted; never shortens a longer backoff already in effect.
+	 * Called when all credentials for a provider are backed off.
+	 */
+	markProviderExhausted(provider: string, errorType: UsageLimitErrorType): void {
+		const expiresAt = Date.now() + getBackoffDuration(errorType);
+		this.providerBackoff.set(provider, Math.max(expiresAt, this.providerBackoff.get(provider) ?? 0));
+	}
+
+	/**
+	 * Check if a provider is currently available (not backed off at provider level).
+	 */
+	isProviderAvailable(provider: string): boolean {
+		const expiresAt = this.providerBackoff.get(provider);
+		if (expiresAt === undefined) return true;
+		if (Date.now() >= expiresAt) {
+			this.providerBackoff.delete(provider);
+			return true;
+		}
+		return false;
+	}
+
+	/**
+	 * Get milliseconds remaining until provider backoff expires.
+	 * Returns 0 if provider is available.
+	 */
+	getProviderBackoffRemaining(provider: string): number {
+		const expiresAt = this.providerBackoff.get(provider);
+		if (expiresAt === undefined) return 0;
+		const remaining = expiresAt - Date.now();
+		if (remaining <= 0) {
+			this.providerBackoff.delete(provider);
+			return 0;
+		}
+		return remaining;
+	}
+
 	/**
 	 * Check if a credential index is currently backed off.
 	 */
--- a/packages/pi-coding-agent/src/core/fallback-resolver.test.ts
+++ b/packages/pi-coding-agent/src/core/fallback-resolver.test.ts
@ -0,0 +1,229 @@
+// GSD Provider Fallback Resolver Tests
+// Copyright (c) 2026 Jeremy McSpadden <jeremy@fluxlabs.net>
+
+import { describe, it, beforeEach, mock } from "node:test";
+import assert from "node:assert/strict";
+import { FallbackResolver } from "./fallback-resolver.js";
+import type { Api, Model } from "@gsd/pi-ai";
+import type { AuthStorage } from "./auth-storage.js";
+import type { ModelRegistry } from "./model-registry.js";
+import type { FallbackChainEntry, SettingsManager } from "./settings-manager.js";
+
+function createMockModel(provider: string, id: string): Model<Api> {
+	return {
+		id,
+		name: id,
+		api: "openai-completions" as Api,
+		provider,
+		baseUrl: `https://api.${provider}.com`,
+		reasoning: false,
+		input: ["text"],
+		cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+		contextWindow: 128000,
+		maxTokens: 16384,
+	} as Model<Api>;
+}
+
+const zaiModel = createMockModel("zai", "glm-5");
+const alibabaModel = createMockModel("alibaba", "glm-5");
+const openaiModel = createMockModel("openai", "gpt-4.1");
+
+const defaultChain: FallbackChainEntry[] = [
+	{ provider: "zai", model: "glm-5", priority: 1 },
+	{ provider: "alibaba", model: "glm-5", priority: 2 },
+	{ provider: "openai", model: "gpt-4.1", priority: 3 },
+];
+
+function createResolver(overrides?: {
+	enabled?: boolean;
+	isProviderAvailable?: (provider: string) => boolean;
+	hasAuth?: (provider: string) => boolean;
+	find?: (provider: string, modelId: string) => Model<Api> | undefined;
+}) {
+	const settingsManager = {
+		getFallbackSettings: () => ({
+			enabled: overrides?.enabled ?? true,
+			chains: { coding: defaultChain },
+		}),
+	} as unknown as SettingsManager;
+
+	const authStorage = {
+		markProviderExhausted: mock.fn(),
+		isProviderAvailable: overrides?.isProviderAvailable ?? (() => true),
+		hasAuth: overrides?.hasAuth ?? (() => true),
+	} as unknown as AuthStorage;
+
+	const modelRegistry = {
+		find: overrides?.find ?? ((provider: string, modelId: string) => {
+			if (provider === "zai" && modelId === "glm-5") return zaiModel;
+			if (provider === "alibaba" && modelId === "glm-5") return alibabaModel;
+			if (provider === "openai" && modelId === "gpt-4.1") return openaiModel;
+			return undefined;
+		}),
+	} as unknown as ModelRegistry;
+
+	return { resolver: new FallbackResolver(settingsManager, authStorage, modelRegistry), authStorage };
+}
+
+// ─── findFallback ────────────────────────────────────────────────────────────
+
+describe("FallbackResolver — findFallback", () => {
+	it("returns next available provider when current fails", async () => {
+		const { resolver } = createResolver();
+		const result = await resolver.findFallback(zaiModel, "quota_exhausted");
+
+		assert.notEqual(result, null);
+		assert.equal(result!.model.provider, "alibaba");
+		assert.equal(result!.model.id, "glm-5");
+		assert.equal(result!.chainName, "coding");
+	});
+
+	it("marks current provider as exhausted", async () => {
+		const { resolver, authStorage } = createResolver();
+		await resolver.findFallback(zaiModel, "rate_limit");
+
+		const fn = authStorage.markProviderExhausted as any;
+		assert.equal(fn.mock.calls.length, 1);
+		assert.equal(fn.mock.calls[0].arguments[0], "zai");
+		assert.equal(fn.mock.calls[0].arguments[1], "rate_limit");
+	});
+
+	it("skips backed-off providers", async () => {
+		const { resolver } = createResolver({
+			isProviderAvailable: (provider: string) => provider !== "alibaba",
+		});
+
+		const result = await resolver.findFallback(zaiModel, "quota_exhausted");
+
+		assert.notEqual(result, null);
+		assert.equal(result!.model.provider, "openai");
+		assert.equal(result!.model.id, "gpt-4.1");
+	});
+
+	it("returns null when all providers are backed off", async () => {
+		const { resolver } = createResolver({
+			isProviderAvailable: () => false,
+		});
+
+		const result = await resolver.findFallback(zaiModel, "quota_exhausted");
+		assert.equal(result, null);
+	});
+
+	it("returns null when fallback is disabled", async () => {
+		const { resolver } = createResolver({ enabled: false });
+		const result = await resolver.findFallback(zaiModel, "quota_exhausted");
+		assert.equal(result, null);
+	});
+
+	it("returns null when model is not in any chain", async () => {
+		const { resolver } = createResolver();
+		const unknownModel = createMockModel("unknown", "some-model");
+		const result = await resolver.findFallback(unknownModel, "quota_exhausted");
+		assert.equal(result, null);
+	});
+
+	it("skips providers without auth", async () => {
+		const { resolver } = createResolver({
+			hasAuth: (provider: string) => provider !== "alibaba",
+		});
+
+		const result = await resolver.findFallback(zaiModel, "quota_exhausted");
+
+		assert.notEqual(result, null);
+		assert.equal(result!.model.provider, "openai");
+	});
+
+	it("skips providers with no model in registry", async () => {
+		const { resolver } = createResolver({
+			find: (provider: string, modelId: string) => {
+				if (provider === "alibaba") return undefined;
+				if (provider === "openai" && modelId === "gpt-4.1") return openaiModel;
+				return undefined;
+			},
+		});
+
+		const result = await resolver.findFallback(zaiModel, "quota_exhausted");
+
+		assert.notEqual(result, null);
+		assert.equal(result!.model.provider, "openai");
+	});
+});
+
+// ─── checkForRestoration ─────────────────────────────────────────────────────
+
+describe("FallbackResolver — checkForRestoration", () => {
+	it("returns higher-priority provider when recovered", async () => {
+		const { resolver } = createResolver();
+		const result = await resolver.checkForRestoration(alibabaModel);
+
+		assert.notEqual(result, null);
+		assert.equal(result!.model.provider, "zai");
+		assert.equal(result!.model.id, "glm-5");
+	});
+
+	it("returns null when already at highest priority", async () => {
+		const { resolver } = createResolver();
+		const result = await resolver.checkForRestoration(zaiModel);
+		assert.equal(result, null);
+	});
+
+	it("returns null when higher-priority provider is still backed off", async () => {
+		const { resolver } = createResolver({
+			isProviderAvailable: (provider: string) => provider !== "zai",
+		});
+
+		const result = await resolver.checkForRestoration(alibabaModel);
+		assert.equal(result, null);
+	});
+
+	it("returns null when fallback is disabled", async () => {
+		const { resolver } = createResolver({ enabled: false });
+		const result = await resolver.checkForRestoration(alibabaModel);
+		assert.equal(result, null);
+	});
+});
+
+// ─── getBestAvailable ────────────────────────────────────────────────────────
+
+describe("FallbackResolver — getBestAvailable", () => {
+	it("returns highest-priority available provider", async () => {
+		const { resolver } = createResolver();
+		const result = await resolver.getBestAvailable("coding");
+
+		assert.notEqual(result, null);
+		assert.equal(result!.model.provider, "zai");
+	});
+
+	it("skips backed-off providers", async () => {
+		const { resolver } = createResolver({
+			isProviderAvailable: (provider: string) => provider !== "zai",
+		});
+
+		const result = await resolver.getBestAvailable("coding");
+
+		assert.notEqual(result, null);
+		assert.equal(result!.model.provider, "alibaba");
+	});
+
+	it("returns null for unknown chain", async () => {
+		const { resolver } = createResolver();
+		const result = await resolver.getBestAvailable("nonexistent");
+		assert.equal(result, null);
+	});
+});
+
+// ─── findChainsForModel ──────────────────────────────────────────────────────
+
+describe("FallbackResolver — findChainsForModel", () => {
+	it("finds chains containing a model", () => {
+		const { resolver } = createResolver();
+		const chains = resolver.findChainsForModel("zai", "glm-5");
+		assert.deepEqual(chains, ["coding"]);
+	});
+
+	it("returns empty array for model not in any chain", () => {
+		const { resolver } = createResolver();
+		const chains = resolver.findChainsForModel("unknown", "model");
+		assert.deepEqual(chains, []);
+	});
+});
--- a/packages/pi-coding-agent/src/core/fallback-resolver.ts
+++ b/packages/pi-coding-agent/src/core/fallback-resolver.ts
@ -0,0 +1,165 @@
+// GSD Provider Fallback Resolver
+// Copyright (c) 2026 Jeremy McSpadden <jeremy@fluxlabs.net>
+
+/**
+ * FallbackResolver - Cross-provider fallback when rate/quota limits are hit.
+ *
+ * When a provider's credentials are all exhausted, this resolver finds the next
+ * available provider+model from a user-configured fallback chain. It also handles
+ * restoration: checking if a higher-priority provider has recovered before each request.
+ */
+
+import type { Api, Model } from "@gsd/pi-ai";
+import type { AuthStorage, UsageLimitErrorType } from "./auth-storage.js";
+import type { ModelRegistry } from "./model-registry.js";
+import type { FallbackChainEntry, SettingsManager } from "./settings-manager.js";
+
+/** Outcome of a successful chain lookup: which model to switch to and why. */
+export interface FallbackResult {
+	/** Model returned by the registry lookup for the selected chain entry. */
+	model: Model<Api>;
+	/** Name of the fallback chain the selected entry was found in. */
+	chainName: string;
+	/** Human-readable explanation of the switch, e.g. "falling back to <provider>/<model>". */
+	reason: string;
+}
+
+/**
+ * Picks a replacement provider+model from the user-configured fallback chains.
+ *
+ * Settings are re-read through `settingsManager.getFallbackSettings()` on every call.
+ * Before a chain is searched, its entries are copied and ordered by ascending
+ * `priority` (lower number = preferred), so chains that were not stored in priority
+ * order still resolve by priority. The sort is stable: entries with equal priority
+ * keep their configured order.
+ */
+export class FallbackResolver {
+	constructor(
+		private settingsManager: SettingsManager,
+		private authStorage: AuthStorage,
+		private modelRegistry: ModelRegistry,
+	) {}
+
+	/**
+	 * Find the next available fallback for a model that just failed.
+	 *
+	 * When fallback is enabled, the failing model's provider is marked exhausted and
+	 * every chain containing the model is searched: lower-priority entries (after the
+	 * model) are tried first; if none is usable the search wraps around to the
+	 * higher-priority entries before it.
+	 *
+	 * @param currentModel - The model whose request just hit a usage limit.
+	 * @param errorType - Limit classification forwarded to `markProviderExhausted`.
+	 * @returns FallbackResult if a fallback is available, null otherwise
+	 */
+	async findFallback(
+		currentModel: Model<Api>,
+		errorType: UsageLimitErrorType,
+	): Promise<FallbackResult | null> {
+		const { enabled, chains } = this.settingsManager.getFallbackSettings();
+		if (!enabled) return null;
+
+		// Mark the current provider as exhausted at the provider level
+		this.authStorage.markProviderExhausted(currentModel.provider, errorType);
+
+		// Search all chains for one containing the current model
+		for (const [chainName, rawEntries] of Object.entries(chains)) {
+			const entries = this._sortByPriority(rawEntries);
+			const currentIndex = this._indexOfModel(entries, currentModel);
+			if (currentIndex === -1) continue;
+
+			// Try entries after the current one (lower priority)
+			const result = await this._findAvailableInChain(chainName, entries, currentIndex + 1);
+			if (result) return result;
+
+			// Wrap around: try entries before the current one
+			const wrapResult = await this._findAvailableInChain(chainName, entries, 0, currentIndex);
+			if (wrapResult) return wrapResult;
+		}
+
+		return null;
+	}
+
+	/**
+	 * Check if a higher-priority provider in the chain has recovered.
+	 * Called before each LLM request to restore the best available provider.
+	 *
+	 * @param currentModel - The model currently in use.
+	 * @returns FallbackResult if a better provider is available, null if current is best
+	 */
+	async checkForRestoration(currentModel: Model<Api>): Promise<FallbackResult | null> {
+		const { enabled, chains } = this.settingsManager.getFallbackSettings();
+		if (!enabled) return null;
+
+		for (const [chainName, rawEntries] of Object.entries(chains)) {
+			const entries = this._sortByPriority(rawEntries);
+			const currentIndex = this._indexOfModel(entries, currentModel);
+
+			// Not in this chain (-1) or already its highest-priority entry (0): nothing to restore
+			if (currentIndex <= 0) continue;
+
+			// Only entries ranked above the current model are candidates
+			const result = await this._findAvailableInChain(chainName, entries, 0, currentIndex);
+			if (result) {
+				return {
+					...result,
+					reason: `${result.model.provider}/${result.model.id} recovered, restoring from fallback`,
+				};
+			}
+		}
+
+		return null;
+	}
+
+	/**
+	 * Get the best available model from a named chain.
+	 * Useful for initial model selection.
+	 *
+	 * @param chainName - Key of the chain in the configured `chains` map.
+	 * @returns The highest-priority usable entry, or null when fallback is disabled,
+	 *          the chain does not exist or is empty, or no entry is usable.
+	 */
+	async getBestAvailable(chainName: string): Promise<FallbackResult | null> {
+		const { enabled, chains } = this.settingsManager.getFallbackSettings();
+		if (!enabled) return null;
+
+		// Own keys only: names such as "constructor" must not resolve to inherited members
+		if (!Object.prototype.hasOwnProperty.call(chains, chainName)) return null;
+
+		const entries = this._sortByPriority(chains[chainName]);
+		if (entries.length === 0) return null;
+
+		return this._findAvailableInChain(chainName, entries, 0);
+	}
+
+	/**
+	 * Find the chain(s) a model belongs to.
+	 *
+	 * @param provider - Provider identifier to match against chain entries.
+	 * @param modelId - Model identifier to match against chain entries.
+	 * @returns Names of all chains listing that provider+model (empty when none do).
+	 */
+	findChainsForModel(provider: string, modelId: string): string[] {
+		const { chains } = this.settingsManager.getFallbackSettings();
+		const result: string[] = [];
+
+		for (const [chainName, entries] of Object.entries(chains)) {
+			if (entries.some((e) => e.provider === provider && e.model === modelId)) {
+				result.push(chainName);
+			}
+		}
+
+		return result;
+	}
+
+	/**
+	 * Return a copy of a chain ordered by ascending priority.
+	 * A missing or non-array chain is treated as empty; the input is never mutated.
+	 */
+	private _sortByPriority(entries: FallbackChainEntry[] | undefined): FallbackChainEntry[] {
+		if (!Array.isArray(entries)) return [];
+		return [...entries].sort((a, b) => a.priority - b.priority);
+	}
+
+	/**
+	 * Position of the entry matching the model's provider+id, or -1 when absent.
+	 */
+	private _indexOfModel(entries: FallbackChainEntry[], model: Model<Api>): number {
+		return entries.findIndex((e) => e.provider === model.provider && e.model === model.id);
+	}
+
+	/**
+	 * Search a chain for the first available entry in [startIndex, endIndex).
+	 * An entry is usable when its provider is not backed off, the model is found in
+	 * the registry, and the provider has auth configured.
+	 *
+	 * @param endIndex - Exclusive upper bound; defaults to the end of the chain.
+	 */
+	private async _findAvailableInChain(
+		chainName: string,
+		entries: FallbackChainEntry[],
+		startIndex: number,
+		endIndex?: number,
+	): Promise<FallbackResult | null> {
+		const end = endIndex ?? entries.length;
+
+		for (let i = startIndex; i < end; i++) {
+			const entry = entries[i];
+
+			// Check provider-level backoff
+			if (!this.authStorage.isProviderAvailable(entry.provider)) {
+				continue;
+			}
+
+			// Check if model exists in registry
+			const model = this.modelRegistry.find(entry.provider, entry.model);
+			if (!model) continue;
+
+			// Check if API key is available
+			const hasAuth = this.authStorage.hasAuth(entry.provider);
+			if (!hasAuth) continue;
+
+			return {
+				model,
+				chainName,
+				reason: `falling back to ${entry.provider}/${entry.model}`,
+			};
+		}
+
+		return null;
+	}
+}
--- a/packages/pi-coding-agent/src/core/index.ts
+++ b/packages/pi-coding-agent/src/core/index.ts
@ -12,6 +12,7 @@ export {
 	type SessionStats,
 } from "./agent-session.js";
 export { type BashExecutorOptions, type BashResult, executeBash, executeBashWithOperations } from "./bash-executor.js";
+export { FallbackResolver, type FallbackResult } from "./fallback-resolver.js";
 export type { CompactionResult } from "./compaction/index.js";
 export { createEventBus, type EventBus, type EventBusController } from "./event-bus.js";

--- a/packages/pi-coding-agent/src/core/settings-manager.ts
+++ b/packages/pi-coding-agent/src/core/settings-manager.ts
@ -68,6 +68,17 @@ export interface TaskIsolationSettings {
 	merge?: "patch" | "branch"; // default: "patch"
 }

+/** One provider+model candidate within a fallback chain. */
+export interface FallbackChainEntry {
+	/** Provider identifier of the candidate. */
+	provider: string;
+	/** Model identifier of the candidate. */
+	model: string;
+	/** Rank within the chain; setFallbackChain stores entries sorted ascending by this value. */
+	priority: number;
+}
+
+/** Cross-provider fallback configuration, stored under the `fallback` key of Settings. */
+export interface FallbackSettings {
+	enabled?: boolean; // default: false
+	chains?: Record<string, FallbackChainEntry[]>; // keyed by chain name
+}
+
 export type TransportSetting = Transport;

 /**
@ -122,6 +133,7 @@ export interface Settings {
 	async?: AsyncSettings;
 	bashInterceptor?: BashInterceptorSettings;
 	taskIsolation?: TaskIsolationSettings;
+	fallback?: FallbackSettings;
 }

 /** Deep merge settings: project/overrides take precedence, nested objects merge recursively */
@ -1010,4 +1022,58 @@ export class SettingsManager {
 	getTaskIsolationMerge(): "patch" | "branch" {
 		return this.settings.taskIsolation?.merge ?? "patch";
 	}
+
+	/** Whether provider fallback is switched on; an unset flag counts as off. */
+	getFallbackEnabled(): boolean {
+		const flag = this.settings.fallback?.enabled;
+		return flag ?? false;
+	}
+
+	/**
+	 * Turn provider fallback on or off in the global settings and save.
+	 * The `fallback` section is created on first use.
+	 */
+	setFallbackEnabled(enabled: boolean): void {
+		let section = this.globalSettings.fallback;
+		if (!section) {
+			section = {};
+			this.globalSettings.fallback = section;
+		}
+		section.enabled = enabled;
+		this.markModified("fallback", "enabled");
+		this.save();
+	}
+
+	/** All configured fallback chains keyed by name; an empty map when none are set. */
+	getFallbackChains(): Record<string, FallbackChainEntry[]> {
+		const configured = this.settings.fallback?.chains;
+		return configured ?? {};
+	}
+
+	/**
+	 * Look up a single fallback chain by name.
+	 *
+	 * Only the chain map's own keys are consulted, so names that collide with
+	 * inherited object members (e.g. "constructor", "toString") yield undefined
+	 * instead of a non-array value.
+	 *
+	 * @param name - Chain name, i.e. a key of `fallback.chains`.
+	 * @returns The chain's entries, or undefined when no such chain is configured.
+	 */
+	getFallbackChain(name: string): FallbackChainEntry[] | undefined {
+		const chains = this.settings.fallback?.chains;
+		if (chains == null || !Object.prototype.hasOwnProperty.call(chains, name)) {
+			return undefined;
+		}
+		return chains[name];
+	}
+
+	/**
+	 * Store (or replace) a named fallback chain in the global settings and save.
+	 * A copy of the entries, sorted ascending by priority, is stored; the caller's
+	 * array is left untouched.
+	 */
+	setFallbackChain(name: string, entries: FallbackChainEntry[]): void {
+		let section = this.globalSettings.fallback;
+		if (!section) {
+			section = {};
+			this.globalSettings.fallback = section;
+		}
+
+		let chainMap = section.chains;
+		if (!chainMap) {
+			chainMap = {};
+			section.chains = chainMap;
+		}
+
+		// Lower priority numbers come first
+		const ordered = Array.from(entries).sort((left, right) => left.priority - right.priority);
+		chainMap[name] = ordered;
+
+		this.markModified("fallback");
+		this.save();
+	}
+
+	/**
+	 * Delete a named fallback chain from the global settings and save.
+	 *
+	 * Existence is decided by the chain map's own keys, so a name that merely matches
+	 * an inherited object member (e.g. "toString") is reported as not found and does
+	 * not trigger a save. When the last chain is removed the `chains` map itself is
+	 * dropped.
+	 *
+	 * @param name - Chain name to remove.
+	 * @returns true if a chain was removed, false if no chain with that name exists.
+	 */
+	removeFallbackChain(name: string): boolean {
+		const fallback = this.globalSettings.fallback;
+		const chains = fallback?.chains;
+		if (!fallback || !chains || !Object.prototype.hasOwnProperty.call(chains, name)) {
+			return false;
+		}
+		delete chains[name];
+		if (Object.keys(chains).length === 0) {
+			delete fallback.chains;
+		}
+		this.markModified("fallback");
+		this.save();
+		return true;
+	}
+
+	/** Snapshot of the fallback configuration with defaults applied (disabled, no chains). */
+	getFallbackSettings(): { enabled: boolean; chains: Record<string, FallbackChainEntry[]> } {
+		const enabled = this.getFallbackEnabled();
+		const chains = this.getFallbackChains();
+		return { enabled, chains };
+	}
 }
--- a/packages/pi-coding-agent/src/modes/interactive/interactive-mode.ts
+++ b/packages/pi-coding-agent/src/modes/interactive/interactive-mode.ts
@ -2382,6 +2382,24 @@ export class InteractiveMode {
 				this.ui.requestRender();
 				break;
 			}
+
+			// Report a provider switch (event.from → event.to, with event.reason) as a status message.
+			case "fallback_provider_switch": {
+				this.showStatus(`Switched from ${event.from} → ${event.to} (${event.reason})`);
+				this.ui.requestRender();
+				break;
+			}
+
+			// Report the provider named in event.provider as restored, as a status message.
+			case "fallback_provider_restored": {
+				this.showStatus(`Restored to ${event.provider}`);
+				this.ui.requestRender();
+				break;
+			}
+
+			// Surface event.reason through showError rather than as a status message.
+			case "fallback_chain_exhausted": {
+				this.showError(event.reason);
+				this.ui.requestRender();
+				break;
+			}
 		}
 	}