singularity-forge/packages/pi-ai/src/utils/repair-tool-json.ts
2026-04-15 13:38:15 +02:00

220 lines
7.1 KiB
TypeScript

/**
* Repair malformed JSON in LLM tool-call arguments.
*
* LLMs sometimes copy YAML template formatting into JSON tool arguments,
* producing patterns like:
*
* "keyDecisions": - Used Web Notification API...,
* "keyFiles": - src-tauri/src/lib.rs — Extended...
*
* instead of valid JSON arrays:
*
* "keyDecisions": ["Used Web Notification API..."],
* "keyFiles": ["src-tauri/src/lib.rs — Extended..."]
*
* This module detects and repairs such patterns before JSON.parse is called.
*
* @see https://github.com/singularity-forge/sf-run/issues/2660
*/
/**
 * Report whether `json` appears to use a YAML-style bullet list where a JSON
 * value is expected (`"key": - item` rather than `"key": ["item"]`).
 *
 * The check is purely textual; the input does not need to be parseable.
 *
 * @param json Raw tool-call argument text.
 * @returns `true` when the bullet pattern occurs anywhere in `json`.
 */
export function hasYamlBulletLists(json: string): boolean {
  // Shape: closing quote of a key, colon, dash, at least one whitespace character.
  // A plain negative number (`"key": -1`) has no whitespace after the dash and so
  // never matches; the lookahead additionally rejects `"key": - 1`.
  const bulletAfterKey = /"\s*:\s*-\s+(?!\d)/;
  return json.search(bulletAfterKey) !== -1;
}
/**
 * Report whether `json` contains an XML parameter tag, opening or closing
 * (e.g. `<parameter name="X">` or `</parameter>`).
 *
 * Some models mix XML tool-call syntax into JSON string values,
 * producing hybrid output that fails JSON.parse.
 *
 * @param json Text to inspect.
 * @returns `true` when `<parameter` or `</parameter` occurs followed by
 *   whitespace or `>`.
 *
 * @see https://github.com/singularity-forge/sf-run/issues/3403
 */
export function hasXmlParameterTags(json: string): boolean {
  // Requiring whitespace or `>` after the tag name avoids matching longer
  // tag names that merely start with "parameter".
  const parameterTag = /<\/?parameter[\s>]/;
  return parameterTag.exec(json) !== null;
}
/**
 * Report whether `json` contains a truncated numeric value
 * (e.g. `"exitCode": -,` or `"durationMs": ,`).
 *
 * Smaller models sometimes emit incomplete numbers when the value
 * is cut off mid-generation.
 *
 * @param json Raw tool-call argument text.
 * @returns `true` when either truncation signature occurs anywhere in `json`.
 *
 * @see https://github.com/singularity-forge/sf-run/issues/3464
 */
export function hasTruncatedNumbers(json: string): boolean {
  const signatures = [
    // Colon, optional whitespace, then a comma: the value is missing entirely.
    // (A bare `}` after the colon is NOT covered by this signature.)
    /:\s*,/,
    // Colon, optional whitespace, a lone minus sign, then a comma or `}`.
    /:\s*-\s*[,}]/,
  ];
  return signatures.some((signature) => signature.test(json));
}
/** One `<parameter name="…">…</parameter>` block extracted from a string value. */
type XmlParameterBlock = {
// Contents of the tag's `name` attribute.
name: string;
// The tag body as returned by `parseXmlParameterValue`: the JSON-decoded
// value when the trimmed body is valid JSON, otherwise the trimmed text.
value: unknown;
};
// Matches one complete `<parameter name="…">body</parameter>` block.
// Group 1 is the (non-empty) name attribute, group 2 the body. The body is
// matched lazily with `[\s\S]` so it may span lines and stops at the first
// closing tag. The `g` flag is required by `String.prototype.matchAll`.
const xmlParameterBlockPattern = /<parameter\s+name="([^"]+)"\s*>([\s\S]*?)<\/parameter>/g;
/**
 * Decode the body of an XML parameter tag.
 *
 * @param raw Text found between the opening and closing tag.
 * @returns `""` when the body is empty or whitespace only; the JSON-decoded
 *   value when the trimmed body is valid JSON (so `42` becomes a number and
 *   `["a"]` an array); otherwise the trimmed text itself.
 */
function parseXmlParameterValue(raw: string): unknown {
  const text = raw.trim();
  if (text.length === 0) {
    return "";
  }

  let decoded: unknown = text;
  try {
    decoded = JSON.parse(text);
  } catch {
    // Not JSON: keep the trimmed text as the value.
  }
  return decoded;
}
/**
 * Collect every complete `<parameter name="…">…</parameter>` block in `text`,
 * in order of appearance.
 *
 * @param text String to scan.
 * @returns One entry per matched block; empty when no complete block is found.
 */
function extractXmlParameterBlocks(text: string): XmlParameterBlock[] {
  return Array.from(text.matchAll(xmlParameterBlockPattern), (match) => ({
    name: match[1],
    // Bodies are decoded so that numeric / boolean / structured values keep their type.
    value: parseXmlParameterValue(match[2] ?? ""),
  }));
}
/**
 * Cut a string value at the point where leaked XML begins.
 *
 * The cut is placed at the earliest occurrence of either `<parameter` or the
 * closing tag named after the field (`</fieldName>`); trailing whitespace
 * before the cut is removed. When neither marker is present only trailing
 * whitespace is trimmed.
 *
 * @param fieldName Name of the JSON field holding `value`.
 * @param value The field's string value.
 * @returns The part of `value` that precedes the leaked XML.
 */
function trimLeakedXmlTail(fieldName: string, value: string): string {
  const markers = ["<parameter", `</${fieldName}>`];
  const cut = markers
    .map((marker) => value.indexOf(marker))
    .filter((index) => index >= 0)
    .reduce((earliest, index) => Math.min(earliest, index), value.length);
  return value.slice(0, cut).trimEnd();
}
/**
 * Strip XML `<parameter>` tags from a JSON string, leaving only the
 * text content. This handles the case where the LLM mixes XML
 * tool-call format into JSON string values.
 *
 * Opening tags are recognised both with raw attribute quotes
 * (`<parameter name="X">`, which is what breaks the surrounding JSON) and with
 * JSON-escaped quotes (`<parameter name=\"X\">`, as they appear inside a
 * well-formed JSON string), so a tag pair is always removed as a whole rather
 * than losing only its closing half.
 *
 * @param json Raw JSON text that may contain parameter tags.
 * @returns `json` with every opening and closing parameter tag removed.
 */
function stripXmlParameterTags(json: string): string {
  return (
    json
      // Opening tags; `\\?` tolerates the backslash of a JSON-escaped quote,
      // and `[^"]*` then absorbs the backslash in front of the closing quote.
      .replace(/<parameter\s+name=\\?"[^"]*"\s*>/g, "")
      // Closing tags.
      .replace(/<\/parameter>/g, "")
  );
}
/**
 * Lift `<parameter name="…">…</parameter>` blocks that leaked into string
 * values of a JSON object up to top-level properties of that object.
 *
 * For every string-valued field that contains at least one complete block,
 * the field is cut back to the text preceding the leaked XML and each block
 * becomes a top-level property. A block never replaces a name that is already
 * reachable on the object (the check uses `in`, so inherited names such as
 * `toString` count as present).
 *
 * Falls back to plain tag stripping on the original text when `json` does not
 * parse, does not parse to a non-array object, or contains no complete block.
 *
 * @param json Raw tool-call argument text.
 * @returns Re-serialised JSON with promoted parameters, or the tag-stripped input.
 */
function promoteXmlParametersToTopLevel(json: string): string {
  try {
    const root: unknown = JSON.parse(json);
    if (root === null || typeof root !== "object" || Array.isArray(root)) {
      return stripXmlParameterTags(json);
    }

    const fields = root as Record<string, unknown>;
    let promotedAny = false;

    // Object.entries snapshots the fields, so properties added below are not revisited.
    for (const [fieldName, fieldValue] of Object.entries(fields)) {
      if (typeof fieldValue !== "string") continue;

      const blocks = hasXmlParameterTags(fieldValue) ? extractXmlParameterBlocks(fieldValue) : [];
      if (blocks.length === 0) continue;

      fields[fieldName] = trimLeakedXmlTail(fieldName, fieldValue);
      for (const { name, value } of blocks) {
        // First writer wins: existing properties and earlier blocks are kept.
        if (name in fields) continue;
        fields[name] = value;
      }
      promotedAny = true;
    }

    if (!promotedAny) {
      return stripXmlParameterTags(json);
    }
    return JSON.stringify(fields);
  } catch {
    // Unparseable input: the best we can do is remove the tags textually.
    return stripXmlParameterTags(json);
  }
}
/**
 * Substitute `0` for numeric values that were cut off mid-generation.
 *
 * Two shapes are rewritten, in this order:
 * - a colon followed directly by a comma: `"key": ,` → `"key": 0,`
 * - a colon followed by a lone minus sign and then a comma or `}`:
 *   `"key": -,` → `"key": 0,` and `"key": -}` → `"key": 0}`
 *
 * @param json Raw tool-call argument text.
 * @returns The text with every such occurrence replaced.
 */
function repairTruncatedNumbers(json: string): string {
  const withMissingValuesFilled = json.replace(/:\s*,/g, ": 0,");
  return withMissingValuesFilled.replace(
    /:\s*-\s*([,}])/g,
    // Keep whichever terminator followed the dangling minus sign.
    (_whole, terminator: string) => `: 0${terminator}`,
  );
}
/** Return `true` when `text` is accepted by `JSON.parse`. */
function isParseableJson(text: string): boolean {
  try {
    JSON.parse(text);
    return true;
  } catch {
    return false;
  }
}

/**
 * Attempt to repair malformed JSON in LLM tool-call arguments.
 *
 * Handles three categories of malformation:
 *
 * 1. **XML parameter tags** (#3403): `<parameter name="X">value</parameter>` blocks
 *    inside string values are promoted to top-level properties when the input
 *    parses as a JSON object; otherwise the tags are stripped from the text.
 * 2. **Truncated numbers** (#3464): `"exitCode": -,` → `"exitCode": 0,`
 * 3. **YAML bullet lists** (#2660): `"key": - item1\n - item2` → `"key": ["item1", "item2"]`
 *
 * Categories 2 and 3 are regex rewrites over the raw text and cannot tell
 * structural characters from the contents of string values. They are therefore
 * skipped whenever the text (after step 1) already parses, so that valid JSON
 * is never altered by them.
 *
 * Returns the input unchanged if no pattern is detected. The repaired text is
 * NOT re-validated: callers must still be prepared for `JSON.parse` to reject
 * the returned string.
 *
 * @param json Raw tool-call argument text.
 * @returns The repaired text, or `json` itself when nothing needed repair.
 */
export function repairToolJson(json: string): string {
  let repaired = json;

  // Phase 1: promote / strip XML parameter tags.
  if (hasXmlParameterTags(repaired)) {
    repaired = promoteXmlParametersToTopLevel(repaired);
  }

  const truncated = hasTruncatedNumbers(repaired);
  if (!truncated && !hasYamlBulletLists(repaired)) {
    return repaired;
  }

  // A textual pattern was detected, but if the text already parses the hit can
  // only be inside a string value (e.g. "a:,b"); rewriting it would corrupt data.
  if (isParseableJson(repaired)) {
    return repaired;
  }

  // Phase 2: repair truncated numbers.
  if (truncated) {
    repaired = repairTruncatedNumbers(repaired);
  }

  // Phase 3: repair YAML bullet lists (re-checked because phase 2 can remove
  // a `"key": - ,` occurrence that looked like a bullet).
  if (!hasYamlBulletLists(repaired)) {
    return repaired;
  }

  // We work on the raw string because the JSON is not parseable yet.
  // Capture groups:
  //   (1) the key including its colon,
  //   (2) the bullet-list body, starting at the first `- `,
  //   (3) the separator: an optional comma plus trailing whitespace.
  // The body ends (lazily) at the next `"key":`, at `}` / `]`, or at end of input.
  const keyBulletPattern =
    /("(?:[^"\\]|\\.)*"\s*:\s*)(- .+?)(,?\s*)(?="(?:[^"\\]|\\.)*"\s*:|[}\]]|$)/gs;

  repaired = repaired.replace(
    keyBulletPattern,
    (
      match: string,
      keyPart: string,
      bulletBody: string,
      separator: string,
      offset: number,
      input: string,
    ) => {
      // Split the bullet body into individual items on `- ` boundaries.
      // Items may contain embedded newlines for multi-line values.
      const items = bulletBody
        .split(/\n?\s*- /)
        .filter((s) => s.trim().length > 0)
        .map((s) => s.replace(/,\s*$/, "").trim());

      // JSON-encode each item as a string, then wrap in an array.
      const jsonArray = `[${items.map((item) => JSON.stringify(item)).join(", ")}]`;

      // The model supplied its own comma: re-emit the separator untouched.
      if (separator.trim() !== "") {
        return keyPart + jsonArray + separator;
      }

      // No comma was present. The separator group swallows all whitespace, so
      // the character right after the match is what the lookahead saw: a `"`
      // means another key follows and a comma must be inserted to delimit it.
      const nextKeyFollows = input.charAt(offset + match.length) === '"';
      return keyPart + jsonArray + (nextKeyFollows ? ", " : "");
    },
  );

  // Strip trailing commas before } or ] (common in repaired JSON).
  repaired = repaired.replace(/,(\s*[}\]])/g, "$1");
  return repaired;
}