singularity-forge/src/resources/extensions/sf/metrics-central.js
Mikael Hugo 05953e9599 fix(lint): restore 0 Biome diagnostics and fix web-mode-onboarding test timeout
- Remove/prefix unused imports and variables across 11 src/ files to clear
  74 diagnostics introduced by 37 subsequent commits since run #3
- Fix pre-existing timeout in web-mode-onboarding integration test:
  - Add timeoutMs: 120_000 to launchPackagedWebHost call (was unbounded)
  - Raise AbortSignal.timeout on simple fetches 10s → 30s (under parallel load)
  - Raise overall test timeout 180s → 420s (budget: 120+60+30+30+120+30=390s)
- Log autoresearch run #4 and update lessons in autoresearch.md

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-10 11:01:43 +02:00

1131 lines
30 KiB
JavaScript

/**
* Centralized Metrics Collector — Unified metrics sink for all SF subsystems.
*
* Purpose: Replace scattered metrics emission (DB, Prometheus, stderr, JSONL)
* with a single collector that aggregates counters, gauges, and histograms,
* then exposes them in Prometheus text format AND persists to SQLite for
* queryable historical analysis.
*
* Consumer: /uok status, health widgets, external Prometheus scrapers,
* TUI cost/context overlay, and programmatic queries via sf-db.
*
* Design:
* - In-memory aggregation with configurable flush interval
* - Prometheus text format output (compatible with existing exposition)
* - SQLite persistence for historical queries (session-scoped)
* - Cost/token metrics alongside operational metrics
* - Retry with exponential backoff on flush failures
* - Zero external dependencies
*/
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { DatabaseSync } from "node:sqlite";
import { sfRoot } from "./paths.js";
import { logWarning } from "./workflow-logger.js";
// Default period between automatic flushes, used when initMetricsCentral is
// not given opts.flushIntervalMs.
const FLUSH_INTERVAL_MS = 60_000; // 1 minute
// Histogram keeps at most this many of the smallest bucket boundaries.
const MAX_HISTOGRAM_BUCKETS = 10;
// A failed flush schedules a retry only while the consecutive-failure count
// is below this value.
const FLUSH_RETRY_MAX = 3;
// Base retry delay; doubled for each further consecutive failure (1s, 2s).
const FLUSH_RETRY_BASE_MS = 1000;
// Names accepted by validateMetricName (Prometheus naming convention).
const METRIC_NAME_PATTERN = /^[a-zA-Z_:][a-zA-Z0-9_:]*$/;
const METRICS_DB_ROW_CAP = 10_000; // keep newest N rows; prune on flush when exceeded
// ─── Metrics System Performance Monitoring ──────────────────────────────────
// Self-monitoring state. initMetricsCentral resets these when it starts
// without a flush timer already running.
let _metricsSystemStartTime = Date.now();
// Flush attempts, including retries scheduled after a failure.
let _flushCount = 0;
let _flushSuccessCount = 0;
let _flushFailureCount = 0;
// Wall-clock duration (ms) and completion time (epoch ms) of the last
// successful flush.
let _lastFlushDuration = 0;
let _lastFlushTimestamp = 0;
// Sum of the durations of all successful flushes; used for the average.
let _totalFlushDuration = 0;
/**
 * Report runtime statistics about the metrics collector itself.
 *
 * @returns {object} Uptime (ms and whole seconds), flush attempt/success/
 *   failure counts, success rate as a percentage string, timing of the last
 *   successful flush, the rounded mean duration of successful flushes, and
 *   whether the dedicated metrics database handle is currently open.
 */
export function getMetricsSystemStats() {
  const uptimeMs = Date.now() - _metricsSystemStartTime;

  // Percentage of flush attempts that succeeded; "0%" before any attempt.
  let successRate = "0%";
  if (_flushCount > 0) {
    const percent = (_flushSuccessCount / _flushCount) * 100;
    successRate = `${percent.toFixed(1)}%`;
  }

  // Only successful flushes contribute to the accumulated duration.
  let averageFlushDuration = 0;
  if (_flushSuccessCount > 0) {
    averageFlushDuration = Math.round(
      _totalFlushDuration / _flushSuccessCount,
    );
  }

  return {
    uptimeMs,
    uptimeSeconds: Math.floor(uptimeMs / 1000),
    flushCount: _flushCount,
    flushSuccessCount: _flushSuccessCount,
    flushFailureCount: _flushFailureCount,
    successRate,
    lastFlushDuration: _lastFlushDuration,
    lastFlushTimestamp: _lastFlushTimestamp,
    averageFlushDuration,
    databaseStatus: _metricsDb ? "connected" : "disconnected",
  };
}
/**
 * Build a snapshot of key performance indicators for dashboards.
 *
 * Combines the collector's self-monitoring stats with counter totals,
 * histogram means and gauge readings read from the in-memory registry.
 *
 * @returns {object} Summary with `uptime` (whole seconds),
 *   `metricsSystemHealth`, `cost`, `tokens`, `performance`, `errors` and
 *   `resources` sections.
 */
export function getSystemPerformanceDashboard() {
  const stats = getMetricsSystemStats();
  const registry = getRegistry();

  // Small readers bound to the registry so the sections below stay compact.
  const total = (metricName) => extractMetricValue(registry, metricName);
  const mean = (metricName) => extractMetricHistogramMean(registry, metricName);
  const reading = (metricName) => extractMetricGaugeValue(registry, metricName);

  const metricsSystemHealth = {
    status: stats.databaseStatus,
    successRate: stats.successRate,
    flushCount: stats.flushCount,
    averageFlushDuration: `${stats.averageFlushDuration}ms`,
  };
  const cost = total("sf_cost_total");
  const tokens = {
    input: total("sf_tokens_input_total"),
    output: total("sf_tokens_output_total"),
  };
  const performance = {
    averageToolExecution: mean("sf_tool_execution_duration_ms"),
    averageModelRequest: mean("sf_model_request_duration_ms"),
    averageDatabaseQuery: mean("sf_database_query_duration_ms"),
  };
  const errors = {
    tool: total("sf_tool_errors_total"),
    model: total("sf_model_errors_total"),
    database: total("sf_database_errors_total"),
    system: total("sf_system_warnings_total"),
  };
  const resources = {
    activeSessions: reading("sf_active_sessions_count"),
    activeAgents: reading("sf_active_agents_count"),
    concurrentToolCalls: reading("sf_concurrent_tool_calls"),
  };

  return {
    uptime: stats.uptimeSeconds,
    metricsSystemHealth,
    cost,
    tokens,
    performance,
    errors,
    resources,
  };
}
/**
 * Sum a counter across all of its label sets.
 *
 * @param {object} registry - Object exposing a `counters` Map of name → counter.
 * @param {string} metricName - Counter name to look up.
 * @returns {number} Sum of every series value, or 0 when the counter is absent.
 */
function extractMetricValue(registry, metricName) {
  const counter = registry.counters.get(metricName);
  if (!counter) return 0;
  return [...counter.values.values()].reduce(
    (runningTotal, seriesValue) => runningTotal + seriesValue,
    0,
  );
}
/**
 * Compute the rounded mean observation of a histogram.
 *
 * @param {object} registry - Object exposing a `histograms` Map of name → histogram.
 * @param {string} metricName - Histogram name to look up.
 * @returns {number} `sum / count` rounded to the nearest integer, or 0 when
 *   the histogram is absent or has recorded no observations.
 */
function extractMetricHistogramMean(registry, metricName) {
  const histogram = registry.histograms.get(metricName);
  if (histogram && histogram.count !== 0) {
    return Math.round(histogram.sum / histogram.count);
  }
  return 0;
}
/**
 * Read a single representative value from a gauge.
 *
 * @param {object} registry - Object exposing a `gauges` Map of name → gauge.
 * @param {string} metricName - Gauge name to look up.
 * @returns {number} Value of the last series in the gauge's Map, or 0 when
 *   the gauge is absent, empty, or that value is null/undefined.
 */
function extractMetricGaugeValue(registry, metricName) {
  const gauge = registry.gauges.get(metricName);
  if (!gauge || gauge.values.size === 0) return 0;
  // Maps iterate in insertion order, so the final entry is the label set that
  // was created last — not necessarily the one that was updated last.
  let lastSeen;
  for (const seriesValue of gauge.values.values()) {
    lastSeen = seriesValue;
  }
  return lastSeen ?? 0;
}
// ─── Metric Types ───────────────────────────────────────────────────────────
/**
 * Counter metric: one accumulated numeric value per label set.
 *
 * Series are stored in `values`, keyed by the string produced by
 * `_buildLabelKey` for the label object.
 */
class Counter {
  /**
   * @param {string} name - Metric name used in the exposition output.
   * @param {string} help - Text for the `# HELP` line.
   * @param {string[]} [labelNames] - Label names recorded for reference.
   */
  constructor(name, help, labelNames = []) {
    this.name = name;
    this.help = help;
    this.labelNames = labelNames;
    this.values = new Map(); // key → number
  }

  /**
   * Add `amount` to the series identified by `labels`.
   *
   * Non-finite amounts (NaN, ±Infinity, non-numbers) are ignored: adding one
   * would permanently turn the accumulated series value into NaN/Infinity.
   *
   * @param {object} [labels] - Label key/value pairs.
   * @param {number} [amount] - Increment (default 1).
   */
  inc(labels = {}, amount = 1) {
    if (!Number.isFinite(amount)) return;
    const key = this._key(labels);
    this.values.set(key, (this.values.get(key) ?? 0) + amount);
  }

  /**
   * @param {object} [labels] - Label key/value pairs.
   * @returns {number} Current value of the series, or 0 if it does not exist.
   */
  get(labels = {}) {
    return this.values.get(this._key(labels)) ?? 0;
  }

  /** Map a label object to its storage key. */
  _key(labels) {
    return _buildLabelKey(labels);
  }

  /** Yield the HELP/TYPE header followed by one sample line per series. */
  *lines() {
    yield `# HELP ${this.name} ${this.help}`;
    yield `# TYPE ${this.name} counter`;
    for (const [key, value] of this.values) {
      yield fmtLine(this.name, value, _parseLabelKey(key));
    }
  }
}
/**
 * Gauge metric: one settable numeric value per label set.
 *
 * Series are stored in `values`, keyed by the string produced by
 * `_buildLabelKey` for the label object.
 */
class Gauge {
  /**
   * @param {string} name - Metric name used in the exposition output.
   * @param {string} help - Text for the `# HELP` line.
   * @param {string[]} [labelNames] - Label names recorded for reference.
   */
  constructor(name, help, labelNames = []) {
    Object.assign(this, { name, help, labelNames, values: new Map() });
  }

  /**
   * Overwrite the series identified by `labels`.
   *
   * @param {object} [labels] - Label key/value pairs.
   * @param {number} value - New reading; anything non-finite is stored as 0.
   */
  set(labels = {}, value) {
    const seriesKey = this._key(labels);
    if (Number.isFinite(value)) {
      this.values.set(seriesKey, value);
    } else {
      this.values.set(seriesKey, 0);
    }
  }

  /**
   * @param {object} [labels] - Label key/value pairs.
   * @returns {number} Current value of the series, or 0 if it does not exist.
   */
  get(labels = {}) {
    const stored = this.values.get(this._key(labels));
    return stored ?? 0;
  }

  /** Map a label object to its storage key. */
  _key(labels) {
    return _buildLabelKey(labels);
  }

  /** Yield the HELP/TYPE header followed by one sample line per series. */
  *lines() {
    const { name, help } = this;
    yield `# HELP ${name} ${help}`;
    yield `# TYPE ${name} gauge`;
    for (const [seriesKey, reading] of this.values.entries()) {
      const labelSet = _parseLabelKey(seriesKey);
      yield fmtLine(name, reading, labelSet);
    }
  }
}
/**
 * Cumulative histogram without labels.
 *
 * `counts` maps each bucket boundary to the number of observations less than
 * or equal to it; `sum` and `count` track all recorded observations.
 */
class Histogram {
  /**
   * @param {string} name - Metric name used in the exposition output.
   * @param {string} help - Text for the `# HELP` line.
   * @param {number[]} [buckets] - Upper bounds. They are de-duplicated,
   *   restricted to finite numbers, sorted ascending, and truncated to the
   *   MAX_HISTOGRAM_BUCKETS smallest values (so the default list loses its
   *   largest boundary, 10).
   */
  constructor(
    name,
    help,
    buckets = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  ) {
    this.name = name;
    this.help = help;
    // Duplicate boundaries share one `counts` entry and would be incremented
    // twice per observation; non-finite boundaries would duplicate the
    // implicit +Inf bucket. Drop both before capping.
    this.buckets = [...new Set(buckets)]
      .filter((bound) => Number.isFinite(bound))
      .sort((a, b) => a - b)
      .slice(0, MAX_HISTOGRAM_BUCKETS);
    this.counts = new Map(); // bucket → count
    this.sum = 0;
    this.count = 0;
  }

  /**
   * Record one observation.
   *
   * Non-finite values (NaN, ±Infinity, non-numbers) are ignored: a single
   * one would permanently turn `sum` — and every mean derived from it —
   * into NaN/Infinity.
   *
   * @param {number} value - Observed value.
   */
  observe(value) {
    if (!Number.isFinite(value)) return;
    this.sum += value;
    this.count++;
    // Buckets are cumulative: every boundary >= value is incremented.
    for (const bucket of this.buckets) {
      if (value <= bucket) {
        this.counts.set(bucket, (this.counts.get(bucket) ?? 0) + 1);
      }
    }
  }

  /**
   * Yield the HELP/TYPE header, one `_bucket` line per boundary, the `+Inf`
   * bucket, and the `_sum` / `_count` lines.
   */
  *lines() {
    yield `# HELP ${this.name} ${this.help}`;
    yield `# TYPE ${this.name} histogram`;
    for (const bucket of this.buckets) {
      yield fmtLine(`${this.name}_bucket`, this.counts.get(bucket) ?? 0, {
        le: String(bucket),
      });
    }
    yield fmtLine(`${this.name}_bucket`, this.count, { le: "+Inf" });
    yield fmtLine(`${this.name}_sum`, this.sum);
    yield fmtLine(`${this.name}_count`, this.count);
  }
}
// ─── Label Escaping ─────────────────────────────────────────────────────────
/**
 * Escape a label value for use inside a label storage key.
 *
 * @param {*} v - Value to escape; coerced with `String()`.
 * @returns {string} The value with every backslash, `=` and `,` prefixed by
 *   a backslash.
 */
function _escapeLabel(v) {
  const text = String(v);
  // One pass is enough: each special character gets exactly one backslash.
  return text.replace(/[\\=,]/g, (special) => `\\${special}`);
}
/**
 * Reverse the escaping applied to a label value.
 *
 * Applies three replacements in sequence: `\,` → `,`, then `\=` → `=`, then
 * `\\` → `\`.
 *
 * @param {string} v - Escaped label value.
 * @returns {string} The unescaped value.
 */
function _unescapeLabel(v) {
  const commasRestored = v.replaceAll("\\,", ",");
  const equalsRestored = commasRestored.replaceAll("\\=", "=");
  return equalsRestored.replaceAll("\\\\", "\\");
}
// ─── Label Key Builder (escapes values, stable ordering) ────────────────────
/**
 * Serialize a label object into a stable storage key.
 *
 * @param {object} labels - Label key/value pairs.
 * @returns {string} `name=value` pairs joined by commas, with names in sorted
 *   order and values escaped by `_escapeLabel`; null/undefined values are
 *   written as the empty string. An empty object yields "".
 */
function _buildLabelKey(labels) {
  const pairs = [];
  for (const labelName of Object.keys(labels).sort()) {
    const escaped = _escapeLabel(labels[labelName] ?? "");
    pairs.push(`${labelName}=${escaped}`);
  }
  return pairs.join(",");
}
/**
 * Parse a label storage key back into a label object.
 *
 * The key is a comma-separated list of `name=value` pairs. Inside a value, a
 * backslash followed by `\`, `=` or `,` stands for that character; any other
 * backslash is kept literally. Parsing stops at the first position from which
 * no further `=` can be found.
 *
 * @param {string} key - Serialized label key.
 * @returns {object} Label name → unescaped value.
 */
function _parseLabelKey(key) {
  const ESCAPABLE = "\\=,";
  const parsed = {};
  const total = key.length;
  let cursor = 0;
  while (cursor < total) {
    const separator = key.indexOf("=", cursor);
    if (separator < 0) break;
    const labelName = key.slice(cursor, separator);

    // Collect the value up to the next unescaped comma.
    const chars = [];
    let pos = separator + 1;
    for (; pos < total; pos++) {
      const current = key[pos];
      if (current === ",") break;
      const isEscape =
        current === "\\" &&
        pos + 1 < total &&
        ESCAPABLE.includes(key[pos + 1]);
      if (isEscape) {
        chars.push(key[pos + 1]);
        pos++; // consume the escaped character as well
        continue;
      }
      chars.push(current);
    }

    parsed[labelName] = chars.join("");
    cursor = pos + 1; // step over the ',' separator
  }
  return parsed;
}
// ─── Formatter ──────────────────────────────────────────────────────────────
/**
 * Render one sample line in the Prometheus text exposition format.
 *
 * Label values are escaped as the format requires: backslash becomes `\\`,
 * double quote becomes `\"`, and line feed becomes `\n`. Without this, a
 * value such as a Windows path or a quoted message yields a line that
 * scrapers cannot parse.
 *
 * @param {string} name - Sample name (metric name plus any suffix).
 * @param {number} value - Sample value, written with default string coercion.
 * @param {object} [labels] - Label name → value pairs; values are coerced
 *   with `String()` before escaping.
 * @returns {string} `name{l1="v1",l2="v2"} value`, or `name value` when there
 *   are no labels.
 */
function fmtLine(name, value, labels = {}) {
  const escapeValue = (raw) =>
    String(raw)
      .replace(/\\/g, "\\\\")
      .replace(/"/g, '\\"')
      .replace(/\n/g, "\\n");
  const pairs = Object.entries(labels).map(
    ([labelName, labelValue]) => `${labelName}="${escapeValue(labelValue)}"`,
  );
  const suffix = pairs.length > 0 ? `{${pairs.join(",")}}` : "";
  return `${name}${suffix} ${value}`;
}
// ─── Validation ─────────────────────────────────────────────────────────────
/**
 * Ensure a metric name is a non-empty string matching METRIC_NAME_PATTERN.
 *
 * @param {*} name - Candidate metric name.
 * @throws {TypeError} When `name` is empty or not a string.
 * @throws {Error} When `name` does not match the naming pattern.
 */
function validateMetricName(name) {
  const isNonEmptyString = typeof name === "string" && name !== "";
  if (!isNonEmptyString) {
    const message = `Metric name must be a non-empty string, got: ${typeof name}`;
    throw new TypeError(message);
  }
  if (METRIC_NAME_PATTERN.test(name)) return;
  throw new Error(
    `Invalid metric name "${name}". Must match Prometheus naming convention: ^[a-zA-Z_:][a-zA-Z0-9_:]*$`,
  );
}
// ─── Central Registry ───────────────────────────────────────────────────────
/**
 * In-memory collection of counters, gauges and histograms, each kept in its
 * own Map keyed by metric name.
 */
class MetricsRegistry {
  counters = new Map();
  gauges = new Map();
  histograms = new Map();
  _metadata = new Map();

  /**
   * Return the counter registered under `name`, creating it on first use.
   * `help` and `labelNames` only take effect when the counter is created.
   */
  counter(name, help, labelNames) {
    const store = this.counters;
    if (store.has(name)) return store.get(name);
    store.set(name, new Counter(name, help, labelNames));
    return store.get(name);
  }

  /**
   * Return the gauge registered under `name`, creating it on first use.
   * `help` and `labelNames` only take effect when the gauge is created.
   */
  gauge(name, help, labelNames) {
    const store = this.gauges;
    if (store.has(name)) return store.get(name);
    store.set(name, new Gauge(name, help, labelNames));
    return store.get(name);
  }

  /**
   * Return the histogram registered under `name`, creating it on first use.
   * `help` and `buckets` only take effect when the histogram is created.
   */
  histogram(name, help, buckets) {
    const store = this.histograms;
    if (store.has(name)) return store.get(name);
    store.set(name, new Histogram(name, help, buckets));
    return store.get(name);
  }

  /**
   * Render every metric's lines — counters first, then gauges, then
   * histograms — joined by newlines and terminated by a trailing newline.
   *
   * @returns {string} Exposition text; just "\n" when the registry is empty.
   */
  buildText() {
    const rendered = [];
    for (const group of [this.counters, this.gauges, this.histograms]) {
      for (const metric of group.values()) {
        for (const line of metric.lines()) rendered.push(line);
      }
    }
    return `${rendered.join("\n")}\n`;
  }

  /** Drop every registered counter, gauge and histogram. */
  clear() {
    for (const group of [this.counters, this.gauges, this.histograms]) {
      group.clear();
    }
  }
}
// ─── Singleton ──────────────────────────────────────────────────────────────
// Lazily created MetricsRegistry; see getRegistry.
let _registry = null;
// Interval handle driving periodic flushMetrics calls (null when stopped).
let _flushTimer = null;
// Interval handle driving periodic updateMetricsSystemHealth calls.
let _metricsHealthTimer = null;
// Project root passed to initMetricsCentral; "" means the collector is not
// initialized, in which case flushMetrics returns immediately.
let _basePath = "";
// Session id attached to persisted rows and injected as a `session_id` label
// by recordCounter / recordGauge when non-empty.
let _sessionId = "";
let _dbAdapter = null; // kept for API compat but no longer used for metrics writes
let _metricsDb = null; // dedicated metrics.db connection
// Consecutive flush failures; reset to 0 by a successful flush.
let _flushFailures = 0;
/**
 * Return the shared MetricsRegistry, creating it on first access.
 *
 * @returns {MetricsRegistry} The module-wide registry instance.
 */
function getRegistry() {
  _registry ??= new MetricsRegistry();
  return _registry;
}
/**
 * Location of the Prometheus text snapshot for a project.
 *
 * @param {string} basePath - Project root.
 * @returns {string} `<sfRoot(basePath)>/runtime/sf-metrics.prom`.
 */
function metricsFilePath(basePath) {
  const root = sfRoot(basePath);
  return join(root, "runtime", "sf-metrics.prom");
}
// ─── DB Persistence ─────────────────────────────────────────────────────────
/**
 * Location of the dedicated metrics database for a project.
 *
 * @param {string} basePath - Project root.
 * @returns {string} `<sfRoot(basePath)>/metrics.db`.
 */
function metricsDbPath(basePath) {
  const root = sfRoot(basePath);
  return join(root, "metrics.db");
}
/**
 * Open the dedicated metrics database and make sure its schema exists.
 *
 * Does nothing when a connection is already open. On success the handle is
 * stored in `_metricsDb`. On any failure a warning is logged, `_metricsDb`
 * stays unset, and a handle that was already opened is closed again so a
 * failing PRAGMA or schema statement does not leak the connection.
 *
 * @param {string} basePath - Project root whose sfRoot holds metrics.db.
 */
function openMetricsDb(basePath) {
  if (_metricsDb) return;
  let db = null;
  try {
    mkdirSync(sfRoot(basePath), { recursive: true });
    db = new DatabaseSync(metricsDbPath(basePath));
    db.exec("PRAGMA journal_mode=WAL");
    db.exec("PRAGMA synchronous=NORMAL");
    db.exec(`
      CREATE TABLE IF NOT EXISTS metrics (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL,
        type TEXT NOT NULL CHECK(type IN ('counter', 'gauge', 'histogram')),
        labels TEXT,
        value REAL NOT NULL,
        timestamp TEXT NOT NULL DEFAULT (datetime('now')),
        session_id TEXT
      )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_metrics_name ON metrics(name)`);
    db.exec(
      `CREATE INDEX IF NOT EXISTS idx_metrics_session ON metrics(session_id)`,
    );
    db.exec(
      `CREATE INDEX IF NOT EXISTS idx_metrics_name_ts ON metrics(name, timestamp DESC)`,
    );
    // Publish the handle only once setup has fully succeeded.
    _metricsDb = db;
  } catch (err) {
    if (db) {
      try {
        db.close();
      } catch {
        // Closing is best-effort; the original error is the one reported.
      }
    }
    logWarning("metrics-central", `Failed to open metrics.db: ${err.message}`);
  }
}
/**
 * Close the dedicated metrics database connection, if one is open.
 *
 * Errors raised by `close()` are ignored; `_metricsDb` is always reset to
 * null afterwards.
 */
function closeMetricsDb() {
  const handle = _metricsDb;
  if (!handle) return;
  try {
    handle.close();
  } catch {
    // swallow
  } finally {
    _metricsDb = null;
  }
}
/**
 * Retained no-op: the metrics table is created by openMetricsDb.
 *
 * @param {*} db - Ignored.
 */
function _ensureMetricsTable(db) {
// no-op — metrics.db is set up by openMetricsDb
// `void` marks the parameter as intentionally unused.
void db;
}
/**
 * Append a snapshot of every in-memory metric to the dedicated metrics
 * database, then prune the table if it has grown past METRICS_DB_ROW_CAP.
 *
 * One row is written per counter/gauge series (labels stored as JSON) and
 * one per histogram (labels column holds `{count, sum}`, value holds the
 * sum). Non-finite values are stored as 0. All rows of a snapshot share one
 * timestamp and are written inside a single transaction, so a failure
 * leaves no partial snapshot behind and the flush pays for one commit
 * instead of one per row.
 *
 * Failures never propagate: a "database is not open" error closes the
 * connection and returns; any other error is logged as a warning.
 *
 * @param {object} registry - Registry exposing `counters`, `gauges` and
 *   `histograms` Maps.
 * @param {string} sessionId - Session id stored with every row.
 * @param {*} _ignored - Unused; kept for call-site compatibility.
 */
function persistMetricsToDb(registry, sessionId, _ignored) {
  const db = _metricsDb;
  if (!db) return;
  const ts = new Date().toISOString();
  const safeNum = (n) => (Number.isFinite(n) ? n : 0);
  let inTransaction = false;
  try {
    const insert = db.prepare(
      "INSERT INTO metrics (name, type, labels, value, timestamp, session_id) VALUES (?, ?, ?, ?, ?, ?)",
    );
    db.exec("BEGIN");
    inTransaction = true;
    // Counters and gauges share the same per-series row layout.
    const labelledGroups = [
      ["counter", registry.counters],
      ["gauge", registry.gauges],
    ];
    for (const [type, metrics] of labelledGroups) {
      for (const metric of metrics.values()) {
        for (const [key, value] of metric.values) {
          insert.run(
            metric.name,
            type,
            JSON.stringify(_parseLabelKey(key)),
            safeNum(value),
            ts,
            sessionId,
          );
        }
      }
    }
    for (const h of registry.histograms.values()) {
      insert.run(
        h.name,
        "histogram",
        JSON.stringify({ count: h.count, sum: h.sum }),
        safeNum(h.sum),
        ts,
        sessionId,
      );
    }
    db.exec("COMMIT");
    inTransaction = false;
  } catch (err) {
    if (inTransaction) {
      try {
        db.exec("ROLLBACK");
      } catch {
        // The connection may already be unusable; nothing more to undo.
      }
    }
    if (err.message?.includes("database is not open")) {
      closeMetricsDb();
      return;
    }
    logWarning("metrics-central", `DB persist failed: ${err.message}`);
  }
  // Prune if the table has grown beyond the cap (best-effort; never block flush)
  try {
    const row = _metricsDb?.prepare("SELECT count(*) as n FROM metrics").get();
    if (row && row.n > METRICS_DB_ROW_CAP) {
      _metricsDb
        .prepare(
          `DELETE FROM metrics WHERE rowid NOT IN (
            SELECT rowid FROM metrics ORDER BY timestamp DESC LIMIT ${METRICS_DB_ROW_CAP}
          )`,
        )
        .run();
    }
  } catch {
    // swallow — prune failure must never surface to the user
  }
}
// ─── Flush with Retry ───────────────────────────────────────────────────────
/**
 * Write the current metrics to disk and to the metrics database.
 *
 * Does nothing until initMetricsCentral has set a base path. On success the
 * Prometheus text file is rewritten, a snapshot is persisted, and the
 * self-monitoring counters are updated. On failure a warning is logged and,
 * while fewer than FLUSH_RETRY_MAX consecutive failures have occurred, a
 * retry is scheduled with exponentially growing delay; once the limit is
 * reached the failure is recorded in `sf_metrics_flush_failed_total`.
 */
function flushMetrics() {
  if (!_basePath) return;
  const flushStartTime = Date.now();
  _flushCount++;
  try {
    const text = getRegistry().buildText();
    const path = metricsFilePath(_basePath);
    mkdirSync(join(sfRoot(_basePath), "runtime"), { recursive: true });
    writeFileSync(path, text, "utf-8");
    // Persist to dedicated metrics.db
    persistMetricsToDb(getRegistry(), _sessionId, null);
    // Update performance metrics
    _flushSuccessCount++;
    _lastFlushDuration = Date.now() - flushStartTime;
    _lastFlushTimestamp = Date.now();
    _totalFlushDuration += _lastFlushDuration;
    _flushFailures = 0;
    // Record flush performance metrics
    try {
      getRegistry()
        .counter(
          "sf_metrics_flush_success_total",
          "Total successful metrics flushes",
          [],
        )
        .inc({}, 1);
      getRegistry()
        .gauge(
          "sf_metrics_flush_duration_ms",
          "Duration of last metrics flush in milliseconds",
          [],
        )
        .set({}, _lastFlushDuration);
    } catch {
      // Best effort - don't let metrics recording break the flush
    }
  } catch (err) {
    _flushFailureCount++;
    _flushFailures++;
    logWarning(
      "metrics-central",
      `Flush failed (attempt ${_flushFailures}): ${err.message}`,
    );
    if (_flushFailures < FLUSH_RETRY_MAX) {
      const delay = FLUSH_RETRY_BASE_MS * 2 ** (_flushFailures - 1);
      const retryTimer = setTimeout(flushMetrics, delay);
      // Like the interval timers, a pending retry must not keep the process
      // alive on its own.
      retryTimer.unref?.();
    } else {
      // Record flush failure as a metric
      try {
        getRegistry()
          .counter(
            "sf_metrics_flush_failed_total",
            "Total metrics flush failures",
            [],
          )
          .inc({}, 1);
      } catch {
        // Best effort
      }
    }
  }
}
// ─── Public API ─────────────────────────────────────────────────────────────
/**
 * Initialize the centralized metrics system.
 *
 * Stores the base path and session id, (re)starts the periodic flush timer,
 * opens the dedicated metrics database, and starts the periodic health
 * reporter if it is not already running. May be called again while running
 * to change the flush interval, session or project; when the base path
 * changes, the database connection is reopened so snapshots go to the new
 * project's metrics.db rather than the previous one's.
 *
 * @param {string} basePath — project root
 * @param {object} [opts] — { flushIntervalMs, sessionId, dbAdapter }
 */
export function initMetricsCentral(basePath, opts = {}) {
  const HEALTH_REPORT_INTERVAL_MS = 300_000; // every 5 minutes
  // Detect a project switch before the stored path is overwritten.
  const basePathChanged = _basePath !== "" && _basePath !== basePath;
  _basePath = basePath;
  _sessionId = opts.sessionId ?? "";
  _dbAdapter = opts.dbAdapter ?? null; // accepted but no longer used for metrics writes
  const interval = opts.flushIntervalMs ?? FLUSH_INTERVAL_MS;
  if (_flushTimer) {
    clearInterval(_flushTimer);
  } else {
    // Fresh start: reset the self-monitoring stats, including the
    // consecutive-failure count that drives retry backoff.
    _metricsSystemStartTime = Date.now();
    _flushCount = 0;
    _flushSuccessCount = 0;
    _flushFailureCount = 0;
    _lastFlushDuration = 0;
    _lastFlushTimestamp = 0;
    _totalFlushDuration = 0;
    _flushFailures = 0;
  }
  _flushTimer = setInterval(flushMetrics, interval);
  // Ensure timer doesn't keep process alive
  _flushTimer.unref?.();
  // Open dedicated metrics.db (separate from main sf.db to avoid WAL pressure).
  // openMetricsDb is a no-op while a connection is open, so a connection
  // belonging to a previous base path has to be closed first.
  if (basePathChanged) closeMetricsDb();
  openMetricsDb(basePath);
  // Start periodic metrics system health reporting
  if (!_metricsHealthTimer) {
    _metricsHealthTimer = setInterval(() => {
      try {
        updateMetricsSystemHealth();
      } catch {
        // Non-fatal
      }
    }, HEALTH_REPORT_INTERVAL_MS);
    _metricsHealthTimer.unref?.();
  }
}
/**
 * Publish the collector's own health as gauges in the registry: uptime in
 * seconds, database connection status (labelled with the project path), and
 * the number of distinct metrics currently held in memory.
 *
 * Failures are logged as warnings and never thrown.
 */
function updateMetricsSystemHealth() {
  const registry = getRegistry();
  const publish = (name, help, labelNames, labels, value) => {
    registry.gauge(name, help, labelNames).set(labels, value);
  };
  try {
    // Record system uptime
    const elapsedMs = Date.now() - _metricsSystemStartTime;
    publish(
      "sf_metrics_system_uptime_seconds",
      "Metrics system uptime in seconds",
      [],
      {},
      Math.floor(elapsedMs / 1000),
    );

    // Record database status
    publish(
      "sf_metrics_database_status",
      "Database connection status (1=connected, 0=disconnected)",
      ["project_path"],
      { project_path: _basePath || "unknown" },
      _metricsDb ? 1 : 0,
    );

    // Record in-memory metrics count; taken before the gauge below is
    // looked up, so on its first creation that gauge is not yet counted.
    const activeCount =
      registry.counters.size + registry.gauges.size + registry.histograms.size;
    publish(
      "sf_metrics_active_count",
      "Number of active metrics in memory",
      [],
      {},
      activeCount,
    );
  } catch (err) {
    logWarning(
      "metrics-central",
      `Failed to update health metrics: ${err.message}`,
    );
  }
}
/**
 * Stop the metrics collector.
 *
 * Cancels the flush and health timers, performs one last flush, clears the
 * stored base path / session / adapter, and closes the metrics database.
 * The in-memory registry is not cleared.
 */
export function stopMetricsCentral() {
if (_flushTimer) {
clearInterval(_flushTimer);
_flushTimer = null;
}
if (_metricsHealthTimer) {
clearInterval(_metricsHealthTimer);
_metricsHealthTimer = null;
}
// Final flush attempt — must run before _basePath is cleared, because
// flushMetrics returns immediately when the base path is empty.
flushMetrics();
_basePath = "";
_sessionId = "";
_dbAdapter = null;
// Close last so the final flush above can still persist to the database.
closeMetricsDb();
}
/**
 * Record a counter increment.
 *
 * When a session id is configured and `labels` has no truthy `session_id`,
 * the session id is added to a copy of the labels (the caller's object is
 * not modified).
 *
 * @param {string} name — metric name (sf_ prefix recommended)
 * @param {object} [labels] — label key-value pairs
 * @param {number} [amount] — increment amount (default 1)
 * @throws {TypeError|Error} When `name` fails validateMetricName.
 */
export function recordCounter(name, labels = {}, amount = 1) {
  validateMetricName(name);
  const { help } = getMetricMeta(name);
  const needsSessionLabel = Boolean(_sessionId) && !labels.session_id;
  const effectiveLabels = needsSessionLabel
    ? { ...labels, session_id: _sessionId }
    : labels;
  const counter = getRegistry().counter(
    name,
    help,
    Object.keys(effectiveLabels),
  );
  counter.inc(effectiveLabels, amount);
}
/**
 * Record a gauge value.
 *
 * When a session id is configured and `labels` has no truthy `session_id`,
 * the session id is added to a copy of the labels (the caller's object is
 * not modified).
 *
 * @param {string} name — metric name
 * @param {number} value — gauge value
 * @param {object} [labels] — label key-value pairs
 * @throws {TypeError|Error} When `name` fails validateMetricName.
 */
export function recordGauge(name, value, labels = {}) {
  validateMetricName(name);
  const { help } = getMetricMeta(name);
  const needsSessionLabel = Boolean(_sessionId) && !labels.session_id;
  const effectiveLabels = needsSessionLabel
    ? { ...labels, session_id: _sessionId }
    : labels;
  const gauge = getRegistry().gauge(name, help, Object.keys(effectiveLabels));
  gauge.set(effectiveLabels, value);
}
/**
 * Record a histogram observation.
 *
 * Help text and bucket boundaries come from the metric's registered
 * metadata; they only take effect the first time the histogram is created.
 *
 * @param {string} name — metric name
 * @param {number} value — observed value
 * @throws {TypeError|Error} When `name` fails validateMetricName.
 */
export function recordHistogram(name, value) {
  validateMetricName(name);
  const { help, buckets } = getMetricMeta(name);
  const histogram = getRegistry().histogram(name, help, buckets);
  histogram.observe(value);
}
/**
 * Record cost and token usage for a unit.
 *
 * Adds `cost` to `sf_cost_total` (labelled by unit, model and, when given,
 * work mode), adds the token counts to the per-model token counters, and
 * sets `sf_cost_last` for the unit/model pair.
 *
 * @param {string} unitId — unit identifier
 * @param {string} modelId — model identifier
 * @param {number} inputTokens — input token count
 * @param {number} outputTokens — output token count
 * @param {number} cost — cost in USD
 * @param {string} [workMode] — current work mode
 */
export function recordCost(
  unitId,
  modelId,
  inputTokens,
  outputTokens,
  cost,
  workMode = "",
) {
  const unitLabels = { unit_id: unitId, model_id: modelId };
  const modelLabels = { model_id: modelId };
  // The work mode label is only attached when a mode was supplied.
  const costLabels = workMode
    ? { ...unitLabels, work_mode: workMode }
    : { ...unitLabels };

  recordCounter("sf_cost_total", costLabels, cost);
  recordCounter("sf_tokens_input_total", { ...modelLabels }, inputTokens);
  recordCounter("sf_tokens_output_total", { ...modelLabels }, outputTokens);
  recordGauge("sf_cost_last", cost, unitLabels);
}
/**
 * Record tool execution performance.
 *
 * The duration always goes into `sf_tool_execution_duration_ms`; the tool
 * name is only used as a label on the error counter.
 *
 * @param {string} toolName — name of the tool
 * @param {number} durationMs — execution duration in milliseconds
 * @param {boolean} [isError] — whether the execution resulted in an error
 * @param {string} [errorType] — type of error if isError is true
 */
export function recordToolExecution(
  toolName,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_tool_execution_duration_ms", durationMs);
  if (!isError) return;
  const errorLabels = {
    tool_name: toolName,
    error_type: errorType || "unknown",
  };
  recordCounter("sf_tool_errors_total", errorLabels, 1);
}
/**
 * Record model request performance.
 *
 * The duration always goes into `sf_model_request_duration_ms`; the model
 * id is only used as a label on the error counter.
 *
 * @param {string} modelId — model identifier
 * @param {number} durationMs — request duration in milliseconds
 * @param {boolean} [isError] — whether the request resulted in an error
 * @param {string} [errorType] — type of error if isError is true
 */
export function recordModelRequest(
  modelId,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_model_request_duration_ms", durationMs);
  if (!isError) return;
  const errorLabels = {
    model_id: modelId,
    error_type: errorType || "unknown",
  };
  recordCounter("sf_model_errors_total", errorLabels, 1);
}
/**
 * Record database operation performance.
 *
 * The duration always goes into `sf_database_query_duration_ms`; the
 * operation name is only used as a label on the error counter.
 *
 * @param {string} operation — database operation name
 * @param {number} durationMs — query duration in milliseconds
 * @param {boolean} [isError] — whether the operation resulted in an error
 * @param {string} [errorType] — type of error if isError is true
 */
export function recordDatabaseOperation(
  operation,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_database_query_duration_ms", durationMs);
  if (!isError) return;
  const errorLabels = { operation, error_type: errorType || "unknown" };
  recordCounter("sf_database_errors_total", errorLabels, 1);
}
/**
 * Record system warning.
 *
 * Increments `sf_system_warnings_total` by one for the given component and
 * warning type.
 *
 * @param {string} component — system component that issued the warning
 * @param {string} warningType — type of warning
 */
export function recordSystemWarning(component, warningType) {
  const warningLabels = { component, warning_type: warningType };
  recordCounter("sf_system_warnings_total", warningLabels, 1);
}
/**
 * Update resource usage gauges.
 *
 * Only fields that are present (not `undefined`) are written; each maps to
 * one gauge.
 *
 * @param {object} resources — resource usage data
 * @param {number} [resources.activeSessions] — number of active sessions
 * @param {number} [resources.activeAgents] — number of active agents
 * @param {number} [resources.concurrentToolCalls] — number of concurrent tool calls
 */
export function updateResourceGauges(resources = {}) {
  const gaugeByField = [
    ["activeSessions", "sf_active_sessions_count"],
    ["activeAgents", "sf_active_agents_count"],
    ["concurrentToolCalls", "sf_concurrent_tool_calls"],
  ];
  for (const [field, gaugeName] of gaugeByField) {
    if (resources[field] === undefined) continue;
    recordGauge(gaugeName, resources[field]);
  }
}
/**
 * Get current metrics text in Prometheus format.
 *
 * @returns {string} Text rendered from the in-memory registry.
 */
export function getMetricsText() {
  const registry = getRegistry();
  return registry.buildText();
}
/**
 * Read persisted metrics from disk.
 *
 * @param {string} basePath - Project root.
 * @returns {string|null} Contents of the Prometheus text file, or null when
 *   the file does not exist or cannot be read.
 */
export function readMetricsFile(basePath) {
  const promPath = metricsFilePath(basePath);
  let contents = null;
  if (existsSync(promPath)) {
    try {
      contents = readFileSync(promPath, "utf-8");
    } catch {
      contents = null;
    }
  }
  return contents;
}
/**
 * Query persisted metric rows from the dedicated metrics database, newest
 * first.
 *
 * @param {object} _db — unused; the module's own metrics.db connection is queried
 * @param {string} [sessionId] — session to filter by
 * @param {string} [name] — metric name to filter by
 * @param {number} [limit] — max rows to return
 * @returns {Array} — metric rows; empty when no database is open or the
 *   query fails (failures are logged as warnings)
 */
export function queryMetrics(_db, sessionId = null, name = null, limit = 1000) {
  if (!_metricsDb) return [];
  try {
    // Optional filters and their bound parameters are collected in step.
    const filters = [];
    const params = [];
    if (sessionId) {
      filters.push(" AND session_id = ?");
      params.push(sessionId);
    }
    if (name) {
      filters.push(" AND name = ?");
      params.push(name);
    }
    params.push(limit);
    const sql = `SELECT * FROM metrics WHERE 1=1${filters.join("")} ORDER BY timestamp DESC LIMIT ?`;
    return _metricsDb.prepare(sql).all(...params);
  } catch (err) {
    logWarning("metrics-central", `Query failed: ${err.message}`);
    return [];
  }
}
// ─── Metric Metadata Registry ───────────────────────────────────────────────
/**
 * Built-in metadata for known metrics, keyed by metric name.
 *
 * Each entry has a `help` string and may have `labels` (label names) and/or
 * `buckets` (histogram boundaries). Within this file, recordCounter and
 * recordGauge read only `help`, and recordHistogram reads `help` and
 * `buckets`; `labels` is descriptive. registerMetricMeta adds or replaces
 * entries at runtime.
 */
const METRIC_META = {
// Subagent inheritance
sf_subagent_dispatch_total: {
help: "Total subagent dispatch attempts",
labels: ["work_mode", "permission_profile"],
},
sf_subagent_dispatch_blocked: {
help: "Subagent dispatches blocked by inheritance policy",
labels: ["reason", "work_mode", "permission_profile"],
},
sf_subagent_dispatch_allowed: {
help: "Subagent dispatches allowed after inheritance check",
labels: ["work_mode", "permission_profile"],
},
// Mode transitions
sf_mode_transition_total: {
help: "Total mode transitions",
labels: ["axis", "from", "to", "reason"],
},
// Task frontmatter
sf_task_created_total: {
help: "Total tasks created with frontmatter",
labels: ["risk_level", "mutation_scope"],
},
sf_task_parallel_blocked: {
help: "Tasks blocked from parallel execution by frontmatter",
labels: ["reason"],
},
// Parallel intent
sf_parallel_intent_declared: {
help: "Parallel worker intents declared",
labels: ["milestone_id"],
},
sf_parallel_intent_conflict: {
help: "Parallel intent conflicts detected",
labels: ["milestone_id"],
},
// Remote steering
sf_remote_steering_applied: {
help: "Remote steering directives applied",
labels: ["directive_type", "source"],
},
sf_remote_steering_rejected: {
help: "Remote steering directives rejected (throttle/invalid)",
labels: ["reason"],
},
// Skill eval
sf_skill_eval_runs_total: {
help: "Total skill evaluation runs",
labels: ["skill_name", "passed"],
},
sf_skill_eval_duration_ms: {
help: "Skill evaluation duration in milliseconds",
buckets: [100, 500, 1000, 5000, 10000, 30000],
},
// Cost guard
sf_cost_guard_blocked: {
help: "Units blocked by cost guard",
labels: ["reason", "model_id"],
},
sf_cost_guard_hourly_spend: {
help: "Current hourly spend in USD",
},
// Gate runner
sf_gate_runs_total: {
help: "Total gate executions",
labels: ["gate_id", "outcome"],
},
sf_gate_latency_ms: {
help: "Gate execution latency in milliseconds",
buckets: [10, 50, 100, 250, 500, 1000, 2500, 5000],
},
// Message bus
sf_message_bus_messages_total: {
help: "Total messages in bus",
labels: ["agent_id"],
},
sf_message_bus_unread_total: {
help: "Unread messages in bus",
labels: ["agent_id"],
},
// Cost tracking
sf_cost_total: {
help: "Total cost in USD",
labels: ["unit_id", "model_id", "work_mode"],
},
sf_tokens_input_total: {
help: "Total input tokens",
labels: ["model_id"],
},
sf_tokens_output_total: {
help: "Total output tokens",
labels: ["model_id"],
},
sf_cost_last: {
help: "Last recorded cost in USD",
labels: ["unit_id", "model_id"],
},
// Performance tracking
sf_session_start_duration_ms: {
help: "Session start duration in milliseconds",
buckets: [100, 250, 500, 1000, 2000, 5000],
},
sf_tool_execution_duration_ms: {
help: "Tool execution duration in milliseconds",
buckets: [10, 50, 100, 250, 500, 1000, 2500, 5000, 10000],
},
sf_model_request_duration_ms: {
help: "Model request duration in milliseconds",
buckets: [100, 500, 1000, 2500, 5000, 10000, 30000, 60000],
},
sf_database_query_duration_ms: {
help: "Database query duration in milliseconds",
buckets: [1, 5, 10, 25, 50, 100, 250, 500],
},
// Resource usage
sf_active_sessions_count: {
help: "Number of active sessions",
},
sf_active_agents_count: {
help: "Number of active agents",
},
sf_concurrent_tool_calls: {
help: "Number of concurrent tool calls",
},
// Error tracking
sf_tool_errors_total: {
help: "Total tool execution errors",
labels: ["tool_name", "error_type"],
},
sf_model_errors_total: {
help: "Total model request errors",
labels: ["model_id", "error_type"],
},
sf_database_errors_total: {
help: "Total database operation errors",
labels: ["operation", "error_type"],
},
sf_system_warnings_total: {
help: "Total system warnings",
labels: ["component", "warning_type"],
},
// Internal
sf_metrics_flush_failed_total: {
help: "Total metrics flush failures",
},
sf_metrics_flush_success_total: {
help: "Total successful metrics flushes",
},
sf_metrics_flush_duration_ms: {
help: "Duration of last metrics flush in milliseconds",
},
sf_metrics_system_uptime_seconds: {
help: "Metrics system uptime in seconds",
},
sf_metrics_database_status: {
help: "Database connection status (1=connected, 0=disconnected)",
labels: ["project_path"],
},
};
/**
 * Look up the metadata registered for a metric.
 *
 * Only METRIC_META's own properties are consulted. A plain property read
 * would also find inherited members, so a syntactically valid metric name
 * such as `toString` or `constructor` would return a function from
 * `Object.prototype` instead of metadata.
 *
 * @param {string} name - Metric name.
 * @returns {object} The registered entry, or `{ help: name, labels: [] }`
 *   when none is registered.
 */
function getMetricMeta(name) {
  const registered = Object.hasOwn(METRIC_META, name)
    ? METRIC_META[name]
    : undefined;
  return registered ?? { help: name, labels: [] };
}
/**
 * Register custom metric metadata, replacing any existing entry for `name`.
 *
 * @param {string} name - Metric name.
 * @param {string} help - Help text for the metric.
 * @param {string[]} [labels] - Label names (default: none).
 * @param {number[]} [buckets] - Histogram bucket boundaries, if applicable.
 */
export function registerMetricMeta(name, help, labels = [], buckets) {
  const entry = { help, labels, buckets };
  METRIC_META[name] = entry;
}