singularity-forge/scripts/model-smoke-benchmark.mjs
Mikael Hugo 02a4339a51 refactor: rename pi-* packages to forge-native names (Phase 1)
Rename all four packages/pi-* directories to forge-native names,
stripping the 'pi' identity and establishing forge's own:

- packages/pi-coding-agent → packages/coding-agent
- packages/pi-ai → packages/ai
- packages/pi-agent-core → packages/agent-core
- packages/pi-tui → packages/tui

Package names updated:
- @singularity-forge/pi-coding-agent → @singularity-forge/coding-agent
- @singularity-forge/pi-ai → @singularity-forge/ai
- @singularity-forge/pi-agent-core → @singularity-forge/agent-core
- @singularity-forge/pi-tui → @singularity-forge/tui

All import references, bare string references, path references,
internal variable names (_bundledPi*), and dist files updated.
@mariozechner/pi-* third-party compat aliases preserved.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-10 11:28:01 +02:00

266 lines
6.8 KiB
JavaScript

#!/usr/bin/env node
import { spawnSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, resolve } from "node:path";
import { performance } from "node:perf_hooks";
// Repository root: this script's directory is one level below it.
const repoRoot = resolve(import.meta.dirname, "..");
// Default report path: <repo>/.sf/model-benchmarks/<ISO timestamp>.json, with
// ":" and "." replaced by "-" in the timestamp.
const defaultOutputPath = resolve(
  repoRoot,
  ".sf",
  "model-benchmarks",
  `${new Date().toISOString().replace(/[:.]/g, "-")}.json`,
);
const args = parseArgs(process.argv.slice(2));
// --models (or --model): comma-separated "provider/model" ids.
const modelsArg = args.models ?? args.model;
// --output: report destination, resolved against the current directory.
const outputPath = resolve(args.output ?? defaultOutputPath);
// --maxModels / --max-models: cap on how many models are benchmarked.
// A non-numeric value parses to NaN and is deliberately left as-is here.
const maxModels = Number.parseInt(
  args.maxModels ?? args["max-models"] ?? "8",
  10,
);
// --maxTokens / --max-tokens: per-task completion budget.
const defaultMaxTokens = 420;
const requestedMaxTokens = Number.parseInt(
  args.maxTokens ?? args["max-tokens"] ?? String(defaultMaxTokens),
  10,
);
// A bare `--max-tokens` flag (parsed as "true") or any non-numeric value yields
// NaN, and zero/negative budgets are meaningless; fall back to the default
// rather than forwarding an invalid limit to the request options.
const maxTokens =
  Number.isInteger(requestedMaxTokens) && requestedMaxTokens > 0
    ? requestedMaxTokens
    : defaultMaxTokens;
// Refresh process.env from the scoped secrets before the AI package is loaded.
// NOTE(review): presumably the package (or its providers) reads API keys from
// the environment, which would be why this runs first — confirm.
await loadSfScopedEnv();
// Load the AI package straight from its TypeScript source.
// NOTE(review): importing a .ts path assumes the script is launched with a
// runtime/loader that can execute TypeScript — confirm.
const { getModel, streamSimpleOpenAICompletions } = await import(
"../packages/ai/src/index.ts"
);
// Models benchmarked when no --models/--model argument is supplied.
const defaultModelIds = [
  "kimi-coding/kimi-k2.6",
  "minimax/MiniMax-M2.7-highspeed",
  "zai/glm-4.5",
  "mistral/devstral-latest",
  "alibaba-coding-plan/qwen3-coder-plus",
  "xiaomi/mimo-v2-pro",
  "opencode-go/minimax-m2.7",
  "openrouter/inclusionai/ling-2.6-1t:free",
];
// A non-empty models argument is a comma-separated list; surrounding
// whitespace is trimmed and empty segments are dropped.
const modelIds = modelsArg
  ? modelsArg
      .split(",")
      .map((id) => id.trim())
      .filter((id) => id.length > 0)
  : defaultModelIds;
/**
 * Benchmark tasks run against every selected model. Each entry provides:
 * - id: label used in the results and console output
 * - maxTokens: completion budget passed with the request
 * - prompt: the user message sent to the model
 * - check(text): returns true when the response text passes the task
 */
const tasks = [
  {
    id: "json-repair",
    // Capped at 280 tokens even when the global budget is larger.
    maxTokens: Math.min(maxTokens, 280),
    prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }.
Broken payload: {"bug":"path traversal\\n- accepts ../foo","fix":123,"tests":"none"}.
Normalize it semantically; no markdown.`,
    check: (text) => {
      try {
        const parsed = JSON.parse(text);
        return (
          typeof parsed.bug === "string" &&
          typeof parsed.fix === "string" &&
          Array.isArray(parsed.tests) &&
          // The requested schema is string[], so every element must be a
          // string; an array of numbers or objects is not a valid repair.
          parsed.tests.every((entry) => typeof entry === "string")
        );
      } catch {
        // Invalid JSON, or a `null` payload whose property access throws.
        return false;
      }
    },
  },
  {
    id: "path-debug",
    maxTokens,
    prompt: `Find the bug and propose the minimal patch. Code:
function isSafe(base, target) {
const resolved = path.resolve(base, target)
return resolved.startsWith(base)
}
Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
    // Keyword heuristic: the answer must mention the prefix comparison and at
    // least one path-handling term.
    check: (text) =>
      /startsWith|prefix/i.test(text) &&
      /path\.sep|relative|normalize|resolve/i.test(text),
  },
  {
    id: "routing-plan",
    maxTokens,
    prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
    // Both target ids must appear, along with some mention of verification.
    check: (text) =>
      /kimi-k2\.5/.test(text) &&
      /kimi-k2\.6/.test(text) &&
      /test|verify|validation/i.test(text),
  },
];
// Apply the --max-models cap; a non-numeric cap (NaN) keeps every requested model.
const selectedModels = modelIds.slice(
  0,
  Number.isFinite(maxModels) ? maxModels : modelIds.length,
);
// One entry per (model, task) run, plus one entry per model skipped up front.
const results = [];
for (const fullId of selectedModels) {
  // Ids look like "<provider>/<model>"; only the first "/" separates the
  // provider, so the model part may itself contain slashes.
  const slash = fullId.indexOf("/");
  if (slash === -1) {
    const error = "expected provider/model id";
    results.push({ model: fullId, ok: false, error });
    // Surface skipped models on the console too, not only in the report.
    console.log(`SKIP ${fullId} ${error}`);
    continue;
  }
  const provider = fullId.slice(0, slash);
  const modelId = fullId.slice(slash + 1);
  const model = getModel(provider, modelId);
  if (!model) {
    const error = "model not found in registry";
    results.push({ model: fullId, ok: false, error });
    console.log(`SKIP ${fullId} ${error}`);
    continue;
  }
  for (const task of tasks) {
    const started = performance.now();
    let text = "";
    let result;
    try {
      const stream = streamSimpleOpenAICompletions(
        model,
        {
          systemPrompt:
            "You are a precise software engineering benchmark model. Follow requested output formats exactly.",
          messages: [
            { role: "user", content: task.prompt, timestamp: Date.now() },
          ],
        },
        { temperature: 0, maxTokens: task.maxTokens },
      );
      // Accumulate only the text deltas; other event types are ignored.
      for await (const event of stream) {
        if (event.type === "text_delta") text += event.delta;
      }
      result = await stream.result();
    } catch (error) {
      const elapsedMs = Math.round(performance.now() - started);
      const message = error instanceof Error ? error.message : String(error);
      results.push({
        model: fullId,
        task: task.id,
        ok: false,
        elapsedMs,
        error: message,
      });
      // Thrown failures previously produced no console output at all.
      console.log(`FAIL ${fullId} ${task.id} ${elapsedMs}ms error: ${message}`);
      continue;
    }
    const elapsedMs = Math.round(performance.now() - started);
    // A run passes only if the stream did not stop on an error and the
    // task's own check accepts the collected text.
    const passed = result.stopReason !== "error" && task.check(text);
    results.push({
      model: fullId,
      task: task.id,
      ok: passed,
      stopReason: result.stopReason,
      errorMessage: result.errorMessage,
      elapsedMs,
      chars: text.length,
      usage: result.usage,
      // Keep only the first 700 characters of the response in the report.
      sample: text.slice(0, 700),
    });
    console.log(
      `${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`,
    );
  }
}
// Assemble the run summary and persist it as pretty-printed JSON.
const createdAt = new Date().toISOString();
const taskIds = tasks.map(({ id }) => id);
const report = {
  createdAt,
  models: selectedModels,
  tasks: taskIds,
  results,
};
// Create the destination directory (and any missing parents) before writing.
const outputDir = dirname(outputPath);
mkdirSync(outputDir, { recursive: true });
const serializedReport = JSON.stringify(report, null, 2);
writeFileSync(outputPath, `${serializedReport}\n`);
console.log(`wrote ${outputPath}`);
/**
 * Parse long-form command-line options into a plain object.
 *
 * Supported forms:
 * - `--key value`  → `{ key: "value" }`
 * - `--key=value`  → `{ key: "value" }` (the value may itself contain "=")
 * - `--flag`       → `{ flag: "true" }` when followed by another option,
 *                    an empty string, or nothing
 *
 * Arguments that do not start with "--" and are not consumed as a value are
 * ignored. All values are strings.
 *
 * @param {string[]} argv - Arguments, without the node/script prefix.
 * @returns {Record<string, string>} Map of option name to value.
 */
function parseArgs(argv) {
  const parsed = {};
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (!arg.startsWith("--")) continue;
    const key = arg.slice(2);
    // Inline `--key=value`: split on the first "=" only. A leading "="
    // (empty key) is not treated as inline and falls through below.
    const equalsAt = key.indexOf("=");
    if (equalsAt > 0) {
      parsed[key.slice(0, equalsAt)] = key.slice(equalsAt + 1);
      continue;
    }
    const next = argv[i + 1];
    if (!next || next.startsWith("--")) {
      parsed[key] = "true";
    } else {
      parsed[key] = next;
      // The following argument was consumed as this option's value.
      i++;
    }
  }
  return parsed;
}
/**
 * Replace the sf-scoped variables in `process.env` with values taken from the
 * sops-encrypted secrets file under the user's home directory.
 *
 * Steps:
 * 1. Delete every variable named in the sf wrapper's `sf_scoped_env` list.
 * 2. Decrypt the secrets file with `sops`.
 * 3. Pipe the decrypted YAML through `yq` to emit `KEY=VALUE` lines from the
 *    scalar entries of `.sf`, from `.sf.env`, and from each `.sf.providers.*.env`.
 * 4. Export each line whose key is a valid identifier and whose value is
 *    non-empty.
 *
 * Returns without exporting anything if either command exits non-zero or
 * prints nothing; step 1 has already happened by then.
 *
 * @returns {Promise<void>}
 */
async function loadSfScopedEnv() {
  const home = homedir();
  const secretsFile = `${home}/.dotfiles/secrets/api-keys.yaml`;
  const sopsConfig = `${home}/.dotfiles/.sops.yaml`;
  const wrapperPath = `${home}/.local/bin/sf`;

  // Clear the scoped names first so stale values never survive the reload.
  for (const scopedName of readSfScopedEnvNames(wrapperPath)) {
    delete process.env[scopedName];
  }

  const sopsRun = spawnSync("sops", ["--config", sopsConfig, "-d", secretsFile], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "ignore"],
  });
  const sopsFailed = sopsRun.status !== 0 || !sopsRun.stdout;
  if (sopsFailed) return;

  // Emits one KEY=VALUE line per usable entry; kept verbatim.
  const yqExpression = `(
(.sf // {} | to_entries[]
| select((.value | type) == "string" or (.value | type) == "number" or (.value | type) == "boolean")
| select(.value != null and .value != "")
| "\\(.key)=\\(.value)"),
(.sf.env // {} | to_entries[]
| select(.value != null and .value != "")
| "\\(.key)=\\(.value)"),
(.sf.providers // {} | to_entries[]
| (.value.env // {})
| to_entries[]
| select(.value != null and .value != "")
| "\\(.key)=\\(.value)")
)`;
  const yqRun = spawnSync("yq", ["-r", yqExpression], {
    input: sopsRun.stdout,
    encoding: "utf8",
    stdio: ["pipe", "pipe", "ignore"],
  });
  const yqFailed = yqRun.status !== 0 || !yqRun.stdout;
  if (yqFailed) return;

  const envNamePattern = /^[A-Za-z_][A-Za-z0-9_]*$/;
  for (const entry of yqRun.stdout.split(/\r?\n/)) {
    // Split on the first "=" only; values may contain further "=" characters.
    const separatorAt = entry.indexOf("=");
    if (separatorAt < 1) continue;
    const name = entry.slice(0, separatorAt);
    const value = entry.slice(separatorAt + 1);
    if (!envNamePattern.test(name)) continue;
    if (value === "") continue;
    process.env[name] = value;
  }
}
/**
 * Read the environment variable names listed in the `sf_scoped_env=( ... )`
 * array of the sf wrapper script.
 *
 * The array must have its opening "(" at the end of a line and its closing
 * ")" at the start of a line. Each entry line is trimmed, and only lines made
 * up solely of `A-Z`, `0-9` and `_` are kept, so comments and blank lines are
 * dropped. Both LF and CRLF line endings are accepted.
 *
 * @param {string} wrapperPath - Path to the wrapper script.
 * @returns {string[]} The listed names, or an empty array if the file cannot
 *   be read or contains no such array.
 */
function readSfScopedEnvNames(wrapperPath) {
  try {
    const source = readFileSync(wrapperPath, "utf8");
    // Accept CRLF around the array body as well, matching the line split
    // below; with a bare "\n" a CRLF file never matched at all.
    const match = source.match(/sf_scoped_env=\(\r?\n([\s\S]*?)\r?\n\)/);
    if (!match) return [];
    return match[1]
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => /^[A-Z0-9_]+$/.test(line));
  } catch {
    // Best effort: a missing or unreadable wrapper means nothing to clear.
    return [];
  }
}