- Remove/prefix unused imports and variables across 11 src/ files to clear 74 diagnostics introduced by 37 subsequent commits since run #3 - Fix pre-existing timeout in web-mode-onboarding integration test: - Add timeoutMs: 120_000 to launchPackagedWebHost call (was unbounded) - Raise AbortSignal.timeout on simple fetches 10s → 30s (under parallel load) - Raise overall test timeout 180s → 420s (budget: 120+60+30+30+120+30=390s) - Log autoresearch run #4 and update lessons in autoresearch.md Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
234 lines
6.4 KiB
JavaScript
234 lines
6.4 KiB
JavaScript
/**
 * Tests for auto-runaway-guard.ts — progress-check behavior.
 *
 * Purpose: verify the runaway guard does not hard-pause units that are
 * making file-change progress, even when token growth would otherwise
 * trigger a hard pause.
 *
 * Consumer: autonomous loop — evaluateRunawayGuard() controls whether a
 * unit is warned, hard-paused, or allowed to continue.
 */
|
|
import assert from "node:assert/strict";
|
|
import { test } from "vitest";
|
|
import {
|
|
evaluateRunawayGuard,
|
|
resetRunawayGuardState,
|
|
} from "../auto-runaway-guard.js";
|
|
|
|
/**
 * Build a runaway-guard config object for the tests in this file.
 *
 * Starts from a fixed set of defaults and shallow-merges `overrides` on top,
 * so any key present in `overrides` replaces the default of the same name.
 *
 * @param {object} [overrides={}] - Fields to replace (or add) on top of the defaults.
 * @returns {object} A new config object on every call.
 */
function makeConfig(overrides = {}) {
  return {
    enabled: true,
    toolCallWarning: 60,
    tokenWarning: 1_000_000,
    elapsedMs: 20 * 60 * 1000,
    changedFilesWarning: 75,
    diagnosticTurns: 2,
    hardPause: true,
    minIntervalMs: 120_000,
    // Spread last so caller-supplied values win over the defaults above.
    ...overrides,
  };
}
|
|
|
|
/**
 * Build a metrics snapshot to pass to evaluateRunawayGuard() in these tests.
 *
 * The defaults (67 tool calls, 2.94M session tokens, 20 minutes elapsed) match
 * the numbers quoted in the discuss-milestone regression test in this file.
 * `changedFiles` and `worktreeChangedSinceStart` default to `undefined`, so a
 * test must set them explicitly when it wants to signal progress or the lack
 * of it.
 *
 * @param {object} [overrides={}] - Fields to replace (or add) on top of the defaults.
 * @returns {object} A new metrics object on every call.
 */
function makeMetrics(overrides = {}) {
  return {
    toolCalls: 67,
    sessionTokens: 2_940_000,
    elapsedMs: 20 * 60 * 1000,
    changedFiles: undefined,
    worktreeFingerprint: null,
    worktreeChangedSinceStart: undefined,
    topTools: {},
    // Spread last so caller-supplied values win over the defaults above.
    ...overrides,
  };
}
|
|
|
|
/**
 * Build the zeroed baseline handed to resetRunawayGuardState() in these tests.
 *
 * @returns {{sessionTokens: number, changedFiles: number, worktreeFingerprint: null}}
 *   A new baseline object on every call.
 */
function makeBaseline() {
  return { sessionTokens: 0, changedFiles: 0, worktreeFingerprint: null };
}
|
|
|
|
// ── Progress-check tests ───────────────────────────────────────────────────────
|
|
|
|
test("progress check returns none regardless of hard-pause conditions", () => {
  // The progress check (changedFiles > 0) is expected to run before the
  // hard-pause block, so a unit with file changes gets "none" even when all
  // other hard-pause conditions are met. This directly tests the fix: the
  // progress guard short-circuits hard-pause.
  resetRunawayGuardState("discuss-milestone", "M001", makeBaseline());
  const config = makeConfig({ hardPause: true });
  const now = Date.now();

  // First call: no progress signalled; finalWarningSent becomes true.
  evaluateRunawayGuard(
    "discuss-milestone",
    "M001",
    makeMetrics({
      sessionTokens: 1_500_000,
      elapsedMs: 22 * 60 * 1000,
      changedFiles: 0,
      worktreeChangedSinceStart: false,
    }),
    config,
    now,
  );

  // Second call: all hard-pause conditions are met (finalWarningSent=true,
  // growth=true) BUT changedFiles > 0 → progress check fires first → "none".
  // The timestamp is 180s later, which is past the 120s minIntervalMs.
  const r = evaluateRunawayGuard(
    "discuss-milestone",
    "M001",
    makeMetrics({
      sessionTokens: 2_000_000,
      elapsedMs: 25 * 60 * 1000,
      changedFiles: 1,
      worktreeChangedSinceStart: false,
    }),
    config,
    now + 180_000,
  );

  assert.equal(
    r.action,
    "none",
    "progress check should return none even when hardPause conditions are met",
  );
});
|
|
|
|
test("returns none when changedFiles > 0 despite token growth and 2 warnings", () => {
  // The core fix: units making file-change progress should not be hard-paused.
  resetRunawayGuardState("discuss-milestone", "M001", makeBaseline());
  const config = makeConfig();
  const now = Date.now();

  // First diagnostic turn. changedFiles is left at the makeMetrics default
  // (undefined), i.e. no progress is signalled yet.
  evaluateRunawayGuard(
    "discuss-milestone",
    "M001",
    makeMetrics({ sessionTokens: 1_500_000 }),
    config,
    now,
  );

  // Second call, 180s later: tokens grew, but changedFiles > 0 → progress,
  // not stuck.
  const r = evaluateRunawayGuard(
    "discuss-milestone",
    "M001",
    makeMetrics({ sessionTokens: 2_940_000, changedFiles: 1 }),
    config,
    now + 180_000,
  );

  assert.equal(r.action, "none", "should not pause when changedFiles > 0");
});
|
|
|
|
test("returns none when worktreeChangedSinceStart === true despite token growth", () => {
  // The worktree fingerprint changed — dirty file content was modified.
  // This is progress even with 0 new changed files.
  resetRunawayGuardState("execute-task", "T01", makeBaseline());
  const config = makeConfig();
  const now = Date.now();

  // First diagnostic turn: no changed files reported.
  evaluateRunawayGuard(
    "execute-task",
    "T01",
    makeMetrics({ sessionTokens: 1_500_000, changedFiles: 0 }),
    config,
    now,
  );

  // Second call, 180s later: tokens grew, worktree changed (dirty content),
  // no new files.
  const r = evaluateRunawayGuard(
    "execute-task",
    "T01",
    makeMetrics({
      sessionTokens: 2_940_000,
      changedFiles: 0,
      worktreeChangedSinceStart: true,
    }),
    config,
    now + 180_000,
  );

  assert.equal(
    r.action,
    "none",
    "should not pause when worktreeChangedSinceStart === true",
  );
});
|
|
|
|
test("progress check does not match when changedFiles is 0 and worktreeChangedSinceStart is false", () => {
  // No progress at all — this should NOT be caught by the progress check.
  // It should proceed to the hard-pause logic.
  resetRunawayGuardState("discuss-milestone", "M001", makeBaseline());
  const config = makeConfig();
  const now = Date.now();

  // First diagnostic turn: no changed files reported.
  evaluateRunawayGuard(
    "discuss-milestone",
    "M001",
    makeMetrics({ sessionTokens: 1_500_000, changedFiles: 0 }),
    config,
    now,
  );

  // Second call, 180s later: tokens grew and neither progress signal is set.
  const r = evaluateRunawayGuard(
    "discuss-milestone",
    "M001",
    makeMetrics({
      sessionTokens: 2_940_000,
      changedFiles: 0,
      worktreeChangedSinceStart: false,
    }),
    config,
    now + 180_000,
  );

  // Whether the hard-pause conditions fire here depends on
  // hasMeaningfulGrowth, so the exact action is intentionally not pinned.
  // TODO(review): this accepts every action named in this file, so it only
  // guards against a throw or an unexpected action value — tighten it once the
  // expected outcome for the no-progress case is confirmed.
  const acceptedActions = ["pause", "warn", "none"];
  assert.ok(
    acceptedActions.includes(r.action),
    `expected pause/warn/none, got ${r.action}`,
  );
});
|
|
|
|
test("discuss-milestone with file changes does not get hard-paused", () => {
  // Regression test for the exact SELF-FEEDBACK.md scenario:
  // discuss-milestone was hard-paused with 2.94M tokens and 67 tool calls
  // despite 1 new changed file and dirty file content modified.
  const unitType = "discuss-milestone";
  const unitId = "M001-6377a4";
  resetRunawayGuardState(unitType, unitId, makeBaseline());
  const config = makeConfig();
  const now = Date.now();

  // First diagnostic turn (token growth begins).
  evaluateRunawayGuard(
    unitType,
    unitId,
    makeMetrics({ sessionTokens: 1_500_000 }),
    config,
    now,
  );

  // Second call, 180s later: exactly the scenario from SELF-FEEDBACK.md —
  // 2.94M tokens, 67 tool calls, 20min elapsed, 1 changed file. The token,
  // tool-call and elapsed values are the makeMetrics defaults, so only the
  // progress signals need overriding.
  const r = evaluateRunawayGuard(
    unitType,
    unitId,
    makeMetrics({
      changedFiles: 1,
      worktreeChangedSinceStart: true, // dirty content changed
    }),
    config,
    now + 180_000,
  );

  assert.equal(
    r.action,
    "none",
    "discuss-milestone with file changes should not be paused",
  );
});
|