- Remove/prefix unused imports and variables across 11 src/ files to clear 74 diagnostics introduced by 37 subsequent commits since run #3 - Fix pre-existing timeout in web-mode-onboarding integration test: - Add timeoutMs: 120_000 to launchPackagedWebHost call (was unbounded) - Raise AbortSignal.timeout on simple fetches 10s → 30s (under parallel load) - Raise overall test timeout 180s → 420s (budget: 120+60+30+30+120+30=390s) - Log autoresearch run #4 and update lessons in autoresearch.md Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1131 lines
30 KiB
JavaScript
1131 lines
30 KiB
JavaScript
/**
|
|
* Centralized Metrics Collector — Unified metrics sink for all SF subsystems.
|
|
*
|
|
* Purpose: Replace scattered metrics emission (DB, Prometheus, stderr, JSONL)
|
|
* with a single collector that aggregates counters, gauges, and histograms,
|
|
* then exposes them in Prometheus text format AND persists to SQLite for
|
|
* queryable historical analysis.
|
|
*
|
|
* Consumer: /uok status, health widgets, external Prometheus scrapers,
|
|
* TUI cost/context overlay, and programmatic queries via sf-db.
|
|
*
|
|
* Design:
|
|
* - In-memory aggregation with configurable flush interval
|
|
* - Prometheus text format output (compatible with existing exposition)
|
|
* - SQLite persistence for historical queries (session-scoped)
|
|
* - Cost/token metrics alongside operational metrics
|
|
* - Retry with exponential backoff on flush failures
|
|
* - Zero external dependencies
|
|
*/
|
|
|
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
import { join } from "node:path";
|
|
import { DatabaseSync } from "node:sqlite";
|
|
import { sfRoot } from "./paths.js";
|
|
import { logWarning } from "./workflow-logger.js";
|
|
|
|
// Default period between automatic flushes; used when opts.flushIntervalMs is not supplied.
const FLUSH_INTERVAL_MS = 60_000; // 1 minute
|
|
// Histogram keeps at most this many bucket bounds (the smallest ones after sorting).
const MAX_HISTOGRAM_BUCKETS = 10;
|
|
// A failed flush schedules a retry only while the consecutive-failure count is below this.
const FLUSH_RETRY_MAX = 3;
|
|
// Base delay for retries; flushMetrics doubles it for each additional consecutive failure.
const FLUSH_RETRY_BASE_MS = 1000;
|
|
// Names accepted by validateMetricName (Prometheus metric naming convention).
const METRIC_NAME_PATTERN = /^[a-zA-Z_:][a-zA-Z0-9_:]*$/;
|
|
const METRICS_DB_ROW_CAP = 10_000; // keep newest N rows; prune on flush when exceeded
|
|
|
|
// ─── Metrics System Performance Monitoring ──────────────────────────────────
|
|
|
|
// Self-monitoring state for the collector. flushMetrics updates it,
// initMetricsCentral resets it on a fresh init, and getMetricsSystemStats reports it.
let _metricsSystemStartTime = Date.now(); // epoch ms; basis for uptime
|
|
let _flushCount = 0; // every flush attempt, retries included
|
|
let _flushSuccessCount = 0; // attempts that completed without throwing
|
|
let _flushFailureCount = 0; // attempts that threw
|
|
let _lastFlushDuration = 0; // ms taken by the most recent successful flush
|
|
let _lastFlushTimestamp = 0; // epoch ms of the most recent successful flush
|
|
let _totalFlushDuration = 0; // ms summed over successful flushes (for the average)
|
|
|
|
/**
 * Snapshot the collector's own bookkeeping.
 *
 * Reports uptime, flush attempt/success/failure tallies, timings of successful
 * flushes, and whether the dedicated metrics database handle is currently set.
 *
 * @returns {object} uptimeMs, uptimeSeconds, flushCount, flushSuccessCount,
 *   flushFailureCount, successRate (string such as "66.7%"), lastFlushDuration,
 *   lastFlushTimestamp, averageFlushDuration, databaseStatus.
 */
export function getMetricsSystemStats() {
  const uptimeMs = Date.now() - _metricsSystemStartTime;

  // Rate is relative to all attempts; "0%" when nothing has been flushed yet.
  let successRate = "0%";
  if (_flushCount > 0) {
    const percent = (_flushSuccessCount / _flushCount) * 100;
    successRate = `${percent.toFixed(1)}%`;
  }

  // Average covers successful flushes only, since only those add to the total.
  let averageFlushDuration = 0;
  if (_flushSuccessCount > 0) {
    averageFlushDuration = Math.round(_totalFlushDuration / _flushSuccessCount);
  }

  return {
    uptimeMs,
    uptimeSeconds: Math.floor(uptimeMs / 1000),
    flushCount: _flushCount,
    flushSuccessCount: _flushSuccessCount,
    flushFailureCount: _flushFailureCount,
    successRate,
    lastFlushDuration: _lastFlushDuration,
    lastFlushTimestamp: _lastFlushTimestamp,
    averageFlushDuration,
    databaseStatus: _metricsDb ? "connected" : "disconnected",
  };
}
|
|
|
|
/**
 * Build a dashboard-shaped summary of key indicators.
 *
 * Combines the collector's self-statistics with values read from the in-memory
 * registry: counter totals (summed over label sets), histogram means and gauges.
 *
 * @returns {object} uptime, metricsSystemHealth, cost, tokens, performance,
 *   errors and resources sections.
 */
export function getSystemPerformanceDashboard() {
  const stats = getMetricsSystemStats();
  const registry = getRegistry();

  // Thin readers bound to this registry snapshot.
  const counterTotal = (metric) => extractMetricValue(registry, metric);
  const histogramMean = (metric) =>
    extractMetricHistogramMean(registry, metric);
  const gaugeValue = (metric) => extractMetricGaugeValue(registry, metric);

  const metricsSystemHealth = {
    status: stats.databaseStatus,
    successRate: stats.successRate,
    flushCount: stats.flushCount,
    averageFlushDuration: `${stats.averageFlushDuration}ms`,
  };
  const cost = counterTotal("sf_cost_total");
  const tokens = {
    input: counterTotal("sf_tokens_input_total"),
    output: counterTotal("sf_tokens_output_total"),
  };
  const performance = {
    averageToolExecution: histogramMean("sf_tool_execution_duration_ms"),
    averageModelRequest: histogramMean("sf_model_request_duration_ms"),
    averageDatabaseQuery: histogramMean("sf_database_query_duration_ms"),
  };
  const errors = {
    tool: counterTotal("sf_tool_errors_total"),
    model: counterTotal("sf_model_errors_total"),
    database: counterTotal("sf_database_errors_total"),
    system: counterTotal("sf_system_warnings_total"),
  };
  const resources = {
    activeSessions: gaugeValue("sf_active_sessions_count"),
    activeAgents: gaugeValue("sf_active_agents_count"),
    concurrentToolCalls: gaugeValue("sf_concurrent_tool_calls"),
  };

  return {
    uptime: stats.uptimeSeconds,
    metricsSystemHealth,
    cost,
    tokens,
    performance,
    errors,
    resources,
  };
}
|
|
|
|
/**
 * Sum a counter across all of its label sets.
 *
 * @param {object} registry - object exposing a `counters` Map
 * @param {string} metricName - counter to look up
 * @returns {number} total over every label set, or 0 when the counter is absent
 */
function extractMetricValue(registry, metricName) {
  const counter = registry.counters.get(metricName);
  if (!counter) return 0;
  return [...counter.values.values()].reduce((sum, v) => sum + v, 0);
}
|
|
|
|
/**
 * Rounded arithmetic mean of a histogram's observations.
 *
 * @param {object} registry - object exposing a `histograms` Map
 * @param {string} metricName - histogram to look up
 * @returns {number} Math.round(sum / count), or 0 when absent or empty
 */
function extractMetricHistogramMean(registry, metricName) {
  const histogram = registry.histograms.get(metricName);
  if (!histogram) return 0;
  const { sum, count } = histogram;
  return count === 0 ? 0 : Math.round(sum / count);
}
|
|
|
|
/**
 * Read a single representative value from a gauge.
 *
 * Returns the value of the last label set in Map iteration (insertion) order.
 *
 * @param {object} registry - object exposing a `gauges` Map
 * @param {string} metricName - gauge to look up
 * @returns {number} that value, or 0 when the gauge is absent, empty or nullish
 */
function extractMetricGaugeValue(registry, metricName) {
  const gauge = registry.gauges.get(metricName);
  if (!gauge || gauge.values.size === 0) return 0;

  let last;
  for (const current of gauge.values.values()) last = current;
  return last ?? 0;
}
|
|
|
|
// ─── Metric Types ───────────────────────────────────────────────────────────
|
|
|
|
/**
 * Accumulating metric keyed by label set.
 *
 * Each distinct label set (serialised by _buildLabelKey) has its own running
 * total in `values`.
 */
class Counter {
  /**
   * @param {string} name - metric name used in the exposition output
   * @param {string} help - text emitted on the `# HELP` line
   * @param {string[]} [labelNames] - stored as given; not enforced here
   */
  constructor(name, help, labelNames = []) {
    this.name = name;
    this.help = help;
    this.labelNames = labelNames;
    this.values = new Map(); // label key → running total
  }

  /**
   * Add `amount` to the total for `labels`.
   *
   * Non-finite amounts (NaN, ±Infinity) are ignored: adding one would turn the
   * stored total into NaN/Infinity permanently, since every later addition
   * builds on it. Gauge.set applies the same finiteness guard.
   *
   * @param {object} [labels] - label key/value pairs
   * @param {number} [amount] - increment (default 1)
   */
  inc(labels = {}, amount = 1) {
    if (!Number.isFinite(amount)) return;
    const key = this._key(labels);
    this.values.set(key, (this.values.get(key) ?? 0) + amount);
  }

  /**
   * @param {object} [labels] - label key/value pairs
   * @returns {number} current total for that label set, 0 if never incremented
   */
  get(labels = {}) {
    return this.values.get(this._key(labels)) ?? 0;
  }

  /** Serialise a label object into the Map key used by `values`. */
  _key(labels) {
    return _buildLabelKey(labels);
  }

  /** Yield the HELP/TYPE header followed by one sample line per label set. */
  *lines() {
    yield `# HELP ${this.name} ${this.help}`;
    yield `# TYPE ${this.name} counter`;
    for (const [key, value] of this.values) {
      yield fmtLine(this.name, value, _parseLabelKey(key));
    }
  }
}
|
|
|
|
class Gauge {
|
|
constructor(name, help, labelNames = []) {
|
|
this.name = name;
|
|
this.help = help;
|
|
this.labelNames = labelNames;
|
|
this.values = new Map();
|
|
}
|
|
|
|
set(labels = {}, value) {
|
|
const safe = Number.isFinite(value) ? value : 0;
|
|
this.values.set(this._key(labels), safe);
|
|
}
|
|
|
|
get(labels = {}) {
|
|
return this.values.get(this._key(labels)) ?? 0;
|
|
}
|
|
|
|
_key(labels) {
|
|
return _buildLabelKey(labels);
|
|
}
|
|
|
|
*lines() {
|
|
yield `# HELP ${this.name} ${this.help}`;
|
|
yield `# TYPE ${this.name} gauge`;
|
|
for (const [key, value] of this.values) {
|
|
const labels = _parseLabelKey(key);
|
|
yield fmtLine(this.name, value, labels);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Histogram with cumulative buckets and no labels.
 *
 * `counts` maps each upper bound to the number of observations that were
 * less than or equal to it, so the stored counts are already cumulative.
 */
class Histogram {
  /**
   * @param {string} name - metric name used in the exposition output
   * @param {string} help - text emitted on the `# HELP` line
   * @param {number[]} [buckets] - upper bounds; sorted ascending, then truncated
   *   to the first MAX_HISTOGRAM_BUCKETS entries (larger bounds are dropped)
   */
  constructor(
    name,
    help,
    buckets = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  ) {
    this.name = name;
    this.help = help;
    // Copy before sorting so the caller's array is not mutated.
    this.buckets = [...buckets]
      .sort((a, b) => a - b)
      .slice(0, MAX_HISTOGRAM_BUCKETS);
    this.counts = new Map(); // upper bound → cumulative count
    this.sum = 0;
    this.count = 0;
  }

  /**
   * Record one observation.
   *
   * Non-finite values (NaN, ±Infinity, non-numbers) are ignored: a single one
   * would make `sum` non-finite for the lifetime of the histogram and would
   * count an observation that fits no meaningful bucket.
   *
   * @param {number} value - observed value
   */
  observe(value) {
    if (!Number.isFinite(value)) return;
    this.sum += value;
    this.count++;
    for (const bucket of this.buckets) {
      if (value <= bucket) {
        this.counts.set(bucket, (this.counts.get(bucket) ?? 0) + 1);
      }
    }
  }

  /** Yield HELP/TYPE, one `_bucket` line per bound plus `+Inf`, then `_sum` and `_count`. */
  *lines() {
    yield `# HELP ${this.name} ${this.help}`;
    yield `# TYPE ${this.name} histogram`;
    for (const bucket of this.buckets) {
      yield fmtLine(`${this.name}_bucket`, this.counts.get(bucket) ?? 0, {
        le: String(bucket),
      });
    }
    // The +Inf bucket always equals the total observation count.
    yield fmtLine(`${this.name}_bucket`, this.count, { le: "+Inf" });
    yield fmtLine(`${this.name}_sum`, this.sum);
    yield fmtLine(`${this.name}_count`, this.count);
  }
}
|
|
|
|
// ─── Label Escaping ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Escape a label value for use inside an internal label key.
 *
 * Backslash, `=` and `,` are each prefixed with a backslash so the key's
 * `name=value,name=value` structure stays parseable.
 *
 * @param {*} v - value to escape; converted with String() first
 * @returns {string} escaped text
 */
function _escapeLabel(v) {
  return String(v).replace(/[\\=,]/g, (ch) => `\\${ch}`);
}
|
|
|
|
/**
 * Reverse _escapeLabel's escaping.
 *
 * Applies three passes in a fixed order: `\,` → `,`, then `\=` → `=`,
 * then `\\` → `\`.
 *
 * @param {string} v - escaped label value
 * @returns {string} unescaped text
 */
function _unescapeLabel(v) {
  const commasRestored = v.replaceAll("\\,", ",");
  const equalsRestored = commasRestored.replaceAll("\\=", "=");
  return equalsRestored.replaceAll("\\\\", "\\");
}
|
|
|
|
// ─── Label Key Builder (escapes values, stable ordering) ────────────────────
|
|
|
|
/**
 * Serialise a label object into a stable string key.
 *
 * Label names are sorted so the same label set always yields the same key;
 * values are escaped with _escapeLabel and nullish values become "".
 *
 * @param {object} labels - label key/value pairs
 * @returns {string} `name=value` pairs joined by commas ("" for no labels)
 */
function _buildLabelKey(labels) {
  const pairs = [];
  for (const labelName of Object.keys(labels).sort()) {
    const escaped = _escapeLabel(labels[labelName] ?? "");
    pairs.push(`${labelName}=${escaped}`);
  }
  return pairs.join(",");
}
|
|
|
|
/**
 * Parse a key produced by _buildLabelKey back into a label object.
 *
 * Each entry is `name=value`. Inside a value, a backslash followed by `\`,
 * `=` or `,` stands for that literal character; any other backslash is kept
 * as-is. An unescaped comma ends the value. Parsing stops at the first entry
 * that has no `=`.
 *
 * @param {string} key - serialised label key
 * @returns {object} label name → unescaped value
 */
function _parseLabelKey(key) {
  const ESCAPABLE = "\\=,";
  const parsed = {};
  let cursor = 0;

  while (cursor < key.length) {
    const separator = key.indexOf("=", cursor);
    if (separator < 0) break;
    const labelName = key.slice(cursor, separator);

    const chars = [];
    let pos = separator + 1;
    for (; pos < key.length; pos += 1) {
      const ch = key[pos];
      const following = key[pos + 1];
      const isEscape =
        ch === "\\" && following !== undefined && ESCAPABLE.includes(following);
      if (isEscape) {
        chars.push(following);
        pos += 1; // consume the escaped character as well
        continue;
      }
      if (ch === ",") break; // unescaped comma terminates this value
      chars.push(ch);
    }

    parsed[labelName] = chars.join("");
    cursor = pos + 1; // step past the terminating comma
  }

  return parsed;
}
|
|
|
|
// ─── Formatter ──────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Format one sample line in the Prometheus text exposition format.
 *
 * Label values are escaped as the format requires (backslash → `\\`,
 * double quote → `\"`, line feed → `\n`), so values such as Windows paths or
 * free-form error text cannot break the line. Non-finite numeric samples are
 * written as `NaN`, `+Inf` or `-Inf` rather than JavaScript's `Infinity`.
 *
 * @param {string} name - metric (or `_bucket`/`_sum`/`_count`) name
 * @param {number} value - sample value
 * @param {object} [labels] - label name → value; omitted braces when empty
 * @returns {string} `name{label="value",...} value`
 */
function fmtLine(name, value, labels = {}) {
  const escapeLabelValue = (raw) =>
    String(raw)
      .replace(/\\/g, "\\\\")
      .replace(/"/g, '\\"')
      .replace(/\n/g, "\\n");

  const formatSample = (sample) => {
    if (typeof sample !== "number" || Number.isFinite(sample)) {
      return `${sample}`;
    }
    if (Number.isNaN(sample)) return "NaN";
    return sample > 0 ? "+Inf" : "-Inf";
  };

  const labelStr = Object.entries(labels)
    .map(([k, v]) => `${k}="${escapeLabelValue(v)}"`)
    .join(",");
  const suffix = labelStr ? `{${labelStr}}` : "";
  return `${name}${suffix} ${formatSample(value)}`;
}
|
|
|
|
// ─── Validation ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Reject metric names that are empty, non-string, or do not match
 * METRIC_NAME_PATTERN.
 *
 * @param {string} name - candidate metric name
 * @throws {TypeError} when `name` is not a non-empty string
 * @throws {Error} when `name` does not match the naming pattern
 */
function validateMetricName(name) {
  const isNonEmptyString = typeof name === "string" && name.length > 0;
  if (!isNonEmptyString) {
    throw new TypeError(
      `Metric name must be a non-empty string, got: ${typeof name}`,
    );
  }

  if (METRIC_NAME_PATTERN.test(name)) return;

  throw new Error(
    `Invalid metric name "${name}". Must match Prometheus naming convention: ^[a-zA-Z_:][a-zA-Z0-9_:]*$`,
  );
}
|
|
|
|
// ─── Central Registry ───────────────────────────────────────────────────────
|
|
|
|
/**
 * In-memory store of every counter, gauge and histogram, keyed by metric name.
 *
 * The accessors are get-or-create: the first call for a name constructs the
 * metric with the supplied help/labels/buckets; later calls return the same
 * instance and ignore those arguments.
 */
class MetricsRegistry {
  counters = new Map();
  gauges = new Map();
  histograms = new Map();
  _metadata = new Map();

  /** Return the counter registered under `name`, creating it on first use. */
  counter(name, help, labelNames) {
    let found = this.counters.get(name);
    if (found === undefined) {
      found = new Counter(name, help, labelNames);
      this.counters.set(name, found);
    }
    return found;
  }

  /** Return the gauge registered under `name`, creating it on first use. */
  gauge(name, help, labelNames) {
    let found = this.gauges.get(name);
    if (found === undefined) {
      found = new Gauge(name, help, labelNames);
      this.gauges.set(name, found);
    }
    return found;
  }

  /** Return the histogram registered under `name`, creating it on first use. */
  histogram(name, help, buckets) {
    let found = this.histograms.get(name);
    if (found === undefined) {
      found = new Histogram(name, help, buckets);
      this.histograms.set(name, found);
    }
    return found;
  }

  /**
   * Render every metric as text: counters first, then gauges, then histograms.
   *
   * @returns {string} newline-joined lines with a trailing newline
   */
  buildText() {
    const out = [];
    for (const family of [this.counters, this.gauges, this.histograms]) {
      for (const metric of family.values()) {
        for (const line of metric.lines()) out.push(line);
      }
    }
    return `${out.join("\n")}\n`;
  }

  /** Drop all counters, gauges and histograms (`_metadata` is left untouched). */
  clear() {
    for (const family of [this.counters, this.gauges, this.histograms]) {
      family.clear();
    }
  }
}
|
|
|
|
// ─── Singleton ──────────────────────────────────────────────────────────────
|
|
|
|
// Module-wide collector state; set by initMetricsCentral and cleared by stopMetricsCentral.
let _registry = null; // lazily created by getRegistry()
|
|
let _flushTimer = null; // interval handle driving periodic flushMetrics calls
|
|
let _metricsHealthTimer = null; // interval handle driving updateMetricsSystemHealth
|
|
let _basePath = ""; // project root; flushMetrics is a no-op while this is empty
|
|
let _sessionId = ""; // injected as the session_id label and DB column when non-empty
|
|
let _dbAdapter = null; // kept for API compat but no longer used for metrics writes
|
|
let _metricsDb = null; // dedicated metrics.db connection
|
|
let _flushFailures = 0; // consecutive failed flushes; reset to 0 by a successful flush
|
|
|
|
/**
 * Return the process-wide MetricsRegistry, constructing it on first access.
 *
 * @returns {MetricsRegistry} the shared registry instance
 */
function getRegistry() {
  _registry ??= new MetricsRegistry();
  return _registry;
}
|
|
|
|
/**
 * Location of the text snapshot written on each flush.
 *
 * @param {string} basePath - project root
 * @returns {string} `<sfRoot>/runtime/sf-metrics.prom`
 */
function metricsFilePath(basePath) {
  const root = sfRoot(basePath);
  return join(root, "runtime", "sf-metrics.prom");
}
|
|
|
|
// ─── DB Persistence ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Location of the dedicated metrics database file.
 *
 * @param {string} basePath - project root
 * @returns {string} `<sfRoot>/metrics.db`
 */
function metricsDbPath(basePath) {
  const root = sfRoot(basePath);
  return join(root, "metrics.db");
}
|
|
|
|
/**
 * Open (and if needed create) the dedicated metrics database.
 *
 * No-op when a connection is already held. On success the handle is stored in
 * `_metricsDb`. Any failure is logged as a warning and leaves `_metricsDb`
 * unset; a handle that was opened before the failure is closed so it is not
 * leaked.
 *
 * @param {string} basePath - project root
 */
function openMetricsDb(basePath) {
  if (_metricsDb) return;
  let db = null;
  try {
    mkdirSync(sfRoot(basePath), { recursive: true });
    db = new DatabaseSync(metricsDbPath(basePath));
    db.exec("PRAGMA journal_mode=WAL");
    db.exec("PRAGMA synchronous=NORMAL");
    db.exec(`
CREATE TABLE IF NOT EXISTS metrics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
type TEXT NOT NULL CHECK(type IN ('counter', 'gauge', 'histogram')),
labels TEXT,
value REAL NOT NULL,
timestamp TEXT NOT NULL DEFAULT (datetime('now')),
session_id TEXT
)
`);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_metrics_name ON metrics(name)`);
    db.exec(
      `CREATE INDEX IF NOT EXISTS idx_metrics_session ON metrics(session_id)`,
    );
    db.exec(
      `CREATE INDEX IF NOT EXISTS idx_metrics_name_ts ON metrics(name, timestamp DESC)`,
    );
    _metricsDb = db;
  } catch (err) {
    // Schema setup can fail after the file was opened; release that handle.
    try {
      db?.close();
    } catch {
      // The handle is already unusable; nothing further to release.
    }
    logWarning("metrics-central", `Failed to open metrics.db: ${err.message}`);
  }
}
|
|
|
|
/**
 * Close the metrics database connection if one is held.
 *
 * Errors raised by close() are deliberately ignored; the handle is forgotten
 * either way so a later openMetricsDb can reconnect.
 */
function closeMetricsDb() {
  if (_metricsDb === null || !_metricsDb) return;
  try {
    _metricsDb.close();
  } catch {
    // swallow
  } finally {
    _metricsDb = null;
  }
}
|
|
|
|
/**
 * Intentionally does nothing: the schema is created by openMetricsDb.
 *
 * @param {*} db - ignored
 */
function _ensureMetricsTable(db) {
  void db; // mark the parameter as deliberately unused
}
|
|
|
|
/**
 * Append a snapshot of every metric in `registry` to the metrics database.
 *
 * One row is written per counter/gauge label set and one per histogram (value
 * = sum, labels column = JSON `{count, sum}`). All rows of a snapshot share a
 * timestamp and are written inside a single transaction, so a failure part-way
 * through leaves no partial snapshot and the flush pays for one commit instead
 * of one per row. Afterwards the table is pruned to METRICS_DB_ROW_CAP rows.
 *
 * Never throws: failures are logged (or, when the database turns out to be
 * closed, the stale handle is dropped) and pruning errors are ignored.
 *
 * @param {MetricsRegistry} registry - source of the metrics to persist
 * @param {string} sessionId - stored in the session_id column
 * @param {*} _ignored - unused; kept for signature compatibility
 */
function persistMetricsToDb(registry, sessionId, _ignored) {
  const db = _metricsDb;
  if (!db) return;
  const ts = new Date().toISOString();
  // The value column is REAL NOT NULL; map NaN/Infinity to 0 before binding.
  const safeNum = (n) => (Number.isFinite(n) ? n : 0);

  let inTransaction = false;
  try {
    const insert = db.prepare(
      "INSERT INTO metrics (name, type, labels, value, timestamp, session_id) VALUES (?, ?, ?, ?, ?, ?)",
    );
    const writeRow = (name, type, labelsJson, value) =>
      insert.run(name, type, labelsJson, safeNum(value), ts, sessionId);

    db.exec("BEGIN");
    inTransaction = true;

    for (const c of registry.counters.values()) {
      for (const [key, value] of c.values) {
        writeRow(c.name, "counter", JSON.stringify(_parseLabelKey(key)), value);
      }
    }
    for (const g of registry.gauges.values()) {
      for (const [key, value] of g.values) {
        writeRow(g.name, "gauge", JSON.stringify(_parseLabelKey(key)), value);
      }
    }
    for (const h of registry.histograms.values()) {
      writeRow(
        h.name,
        "histogram",
        JSON.stringify({ count: h.count, sum: h.sum }),
        h.sum,
      );
    }

    db.exec("COMMIT");
    inTransaction = false;
  } catch (err) {
    if (inTransaction) {
      try {
        db.exec("ROLLBACK");
      } catch {
        // The connection may already be unusable; the original error is handled below.
      }
    }
    if (err.message?.includes("database is not open")) {
      closeMetricsDb();
      return;
    }
    logWarning("metrics-central", `DB persist failed: ${err.message}`);
  }

  // Prune if the table has grown beyond the cap (best-effort; never block flush)
  try {
    const row = _metricsDb?.prepare("SELECT count(*) as n FROM metrics").get();
    if (row && row.n > METRICS_DB_ROW_CAP) {
      _metricsDb
        .prepare(
          `DELETE FROM metrics WHERE rowid NOT IN (
SELECT rowid FROM metrics ORDER BY timestamp DESC LIMIT ${METRICS_DB_ROW_CAP}
)`,
        )
        .run();
    }
  } catch (_) {
    // swallow — prune failure must never surface to the user
  }
}
|
|
|
|
// ─── Flush with Retry ───────────────────────────────────────────────────────
|
|
|
|
/**
 * Write the current metrics to disk and to the metrics database.
 *
 * Does nothing until initMetricsCentral has set a base path. On success the
 * flush statistics are updated and the consecutive-failure count is reset. On
 * failure a retry is scheduled with exponential backoff
 * (FLUSH_RETRY_BASE_MS × 2^(failures − 1)) while fewer than FLUSH_RETRY_MAX
 * consecutive failures have occurred; after that the failure is recorded in
 * `sf_metrics_flush_failed_total` and no retry is scheduled.
 *
 * The retry timer is unref'd so a failing flush cannot keep the process alive,
 * matching how the periodic timers are handled.
 */
function flushMetrics() {
  if (!_basePath) return;

  const flushStartTime = Date.now();
  _flushCount++;

  try {
    const text = getRegistry().buildText();
    const path = metricsFilePath(_basePath);
    mkdirSync(join(sfRoot(_basePath), "runtime"), { recursive: true });
    writeFileSync(path, text, "utf-8");
    // Persist to dedicated metrics.db
    persistMetricsToDb(getRegistry(), _sessionId, null);

    // Update performance metrics
    _flushSuccessCount++;
    _lastFlushDuration = Date.now() - flushStartTime;
    _lastFlushTimestamp = Date.now();
    _totalFlushDuration += _lastFlushDuration;
    _flushFailures = 0;

    // Record flush performance metrics
    try {
      getRegistry()
        .counter(
          "sf_metrics_flush_success_total",
          "Total successful metrics flushes",
          [],
        )
        .inc({}, 1);
      getRegistry()
        .gauge(
          "sf_metrics_flush_duration_ms",
          "Duration of last metrics flush in milliseconds",
          [],
        )
        .set({}, _lastFlushDuration);
    } catch {
      // Best effort - don't let metrics recording break the flush
    }
  } catch (err) {
    _flushFailureCount++;
    _flushFailures++;
    logWarning(
      "metrics-central",
      `Flush failed (attempt ${_flushFailures}): ${err.message}`,
    );
    if (_flushFailures < FLUSH_RETRY_MAX) {
      const delay = FLUSH_RETRY_BASE_MS * 2 ** (_flushFailures - 1);
      const retryTimer = setTimeout(flushMetrics, delay);
      // Do not let a pending retry hold the event loop open.
      if (retryTimer.unref) retryTimer.unref();
    } else {
      // Record flush failure as a metric
      try {
        getRegistry()
          .counter(
            "sf_metrics_flush_failed_total",
            "Total metrics flush failures",
            [],
          )
          .inc({}, 1);
      } catch {
        // Best effort
      }
    }
  }
}
|
|
|
|
// ─── Public API ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Initialize (or re-initialize) the centralized metrics system.
 *
 * Starts the periodic flush timer, opens the dedicated metrics database and
 * starts the five-minute health reporter. Both timers are unref'd so they do
 * not keep the process alive. Flush statistics are reset only when no flush
 * timer was running (a fresh init).
 *
 * When called again with a different `basePath` while a database connection is
 * open, that connection is closed first so metrics are persisted under the new
 * project instead of the previous one.
 *
 * @param {string} basePath — project root
 * @param {object} [opts] — { flushIntervalMs, sessionId, dbAdapter }
 * @param {number} [opts.flushIntervalMs] — flush period in ms; values that are
 *   not a positive finite number fall back to FLUSH_INTERVAL_MS
 * @param {string} [opts.sessionId] — attached to recorded metrics
 * @param {object} [opts.dbAdapter] — accepted for compatibility; unused for writes
 */
export function initMetricsCentral(basePath, opts = {}) {
  const previousBasePath = _basePath;
  _basePath = basePath;
  _sessionId = opts.sessionId ?? "";
  _dbAdapter = opts.dbAdapter ?? null; // accepted but no longer used for metrics writes

  // setInterval treats 0, negative and NaN delays as ~1 ms, which would flush
  // in a tight loop; use the default period instead.
  const requested = opts.flushIntervalMs ?? FLUSH_INTERVAL_MS;
  const interval =
    Number.isFinite(requested) && requested > 0 ? requested : FLUSH_INTERVAL_MS;

  // Reset metrics system stats on fresh init
  if (!_flushTimer) {
    _metricsSystemStartTime = Date.now();
    _flushCount = 0;
    _flushSuccessCount = 0;
    _flushFailureCount = 0;
    _lastFlushDuration = 0;
    _lastFlushTimestamp = 0;
    _totalFlushDuration = 0;
  }

  if (_flushTimer) clearInterval(_flushTimer);
  _flushTimer = setInterval(flushMetrics, interval);

  // Ensure timer doesn't keep process alive
  if (_flushTimer.unref) _flushTimer.unref();

  // openMetricsDb is a no-op while a handle is held, so drop a handle that
  // belongs to a different project before opening.
  if (_metricsDb && previousBasePath !== basePath) closeMetricsDb();

  // Open dedicated metrics.db (separate from main sf.db to avoid WAL pressure)
  openMetricsDb(basePath);

  // Start periodic metrics system health reporting
  if (!_metricsHealthTimer) {
    _metricsHealthTimer = setInterval(() => {
      try {
        updateMetricsSystemHealth();
      } catch {
        // Non-fatal
      }
    }, 300000); // Every 5 minutes
    if (_metricsHealthTimer.unref) _metricsHealthTimer.unref();
  }
}
|
|
|
|
/**
 * Refresh the gauges that describe the collector itself: uptime in seconds,
 * database connection status (1/0, labelled with the project path) and the
 * number of metric families currently held in memory.
 *
 * Failures are logged as warnings and never propagate.
 */
function updateMetricsSystemHealth() {
  const registry = getRegistry();
  try {
    // Uptime since the last fresh init.
    const uptimeSeconds = Math.floor(
      (Date.now() - _metricsSystemStartTime) / 1000,
    );
    const uptimeGauge = registry.gauge(
      "sf_metrics_system_uptime_seconds",
      "Metrics system uptime in seconds",
      [],
    );
    uptimeGauge.set({}, uptimeSeconds);

    // Database connection status.
    const statusGauge = registry.gauge(
      "sf_metrics_database_status",
      "Database connection status (1=connected, 0=disconnected)",
      ["project_path"],
    );
    statusGauge.set(
      { project_path: _basePath || "unknown" },
      _metricsDb ? 1 : 0,
    );

    // Count families before the count gauge itself may be created.
    const families = [registry.counters, registry.gauges, registry.histograms];
    const activeCount = families.reduce((n, family) => n + family.size, 0);
    const activeGauge = registry.gauge(
      "sf_metrics_active_count",
      "Number of active metrics in memory",
      [],
    );
    activeGauge.set({}, activeCount);
  } catch (err) {
    logWarning(
      "metrics-central",
      `Failed to update health metrics: ${err.message}`,
    );
  }
}
|
|
|
|
/**
 * Shut the collector down.
 *
 * Cancels both periodic timers, performs one last flush while the base path is
 * still set, then clears the session state and closes the metrics database.
 */
export function stopMetricsCentral() {
  // clearInterval is a no-op for a null handle, so no guard is required.
  clearInterval(_flushTimer);
  _flushTimer = null;
  clearInterval(_metricsHealthTimer);
  _metricsHealthTimer = null;

  // Final flush attempt — must run before _basePath is cleared.
  flushMetrics();

  _basePath = "";
  _sessionId = "";
  _dbAdapter = null;
  closeMetricsDb();
}
|
|
|
|
/**
 * Increment a counter.
 *
 * When a session id is configured and the caller did not provide a truthy
 * `session_id` label, the session id is added to a copy of the labels.
 *
 * @param {string} name — metric name (sf_ prefix recommended)
 * @param {object} [labels] — label key-value pairs
 * @param {number} [amount] — increment amount (default 1)
 * @throws {TypeError|Error} when `name` fails validateMetricName
 */
export function recordCounter(name, labels = {}, amount = 1) {
  validateMetricName(name);
  const { help } = getMetricMeta(name);

  const needsSession = Boolean(_sessionId) && !labels.session_id;
  const effectiveLabels = needsSession
    ? { ...labels, session_id: _sessionId }
    : labels;

  const counter = getRegistry().counter(
    name,
    help,
    Object.keys(effectiveLabels),
  );
  counter.inc(effectiveLabels, amount);
}
|
|
|
|
/**
 * Set a gauge.
 *
 * When a session id is configured and the caller did not provide a truthy
 * `session_id` label, the session id is added to a copy of the labels.
 *
 * @param {string} name — metric name
 * @param {number} value — gauge value
 * @param {object} [labels] — label key-value pairs
 * @throws {TypeError|Error} when `name` fails validateMetricName
 */
export function recordGauge(name, value, labels = {}) {
  validateMetricName(name);
  const { help } = getMetricMeta(name);

  const needsSession = Boolean(_sessionId) && !labels.session_id;
  const effectiveLabels = needsSession
    ? { ...labels, session_id: _sessionId }
    : labels;

  const gauge = getRegistry().gauge(name, help, Object.keys(effectiveLabels));
  gauge.set(effectiveLabels, value);
}
|
|
|
|
/**
 * Add one observation to a histogram, creating it on first use with the help
 * text and buckets from the metric's metadata.
 *
 * @param {string} name — metric name
 * @param {number} value — observed value
 * @throws {TypeError|Error} when `name` fails validateMetricName
 */
export function recordHistogram(name, value) {
  validateMetricName(name);
  const { help, buckets } = getMetricMeta(name);
  const histogram = getRegistry().histogram(name, help, buckets);
  histogram.observe(value);
}
|
|
|
|
/**
 * Record cost and token usage for a unit.
 *
 * Updates `sf_cost_total`, `sf_tokens_input_total`, `sf_tokens_output_total`
 * and the `sf_cost_last` gauge.
 *
 * Amounts are coerced to numbers before being counted; anything that does not
 * yield a finite number counts as 0. Without this, an `undefined` amount would
 * trigger recordCounter's default increment of 1, and a numeric string would
 * be concatenated onto the running total instead of added.
 *
 * @param {string} unitId — unit identifier
 * @param {string} modelId — model identifier
 * @param {number} inputTokens — input token count
 * @param {number} outputTokens — output token count
 * @param {number} cost — cost in USD
 * @param {string} [workMode] — current work mode; added as a label when non-empty
 */
export function recordCost(
  unitId,
  modelId,
  inputTokens,
  outputTokens,
  cost,
  workMode = "",
) {
  const toAmount = (raw) => {
    const num = Number(raw);
    return Number.isFinite(num) ? num : 0;
  };

  const labels = { unit_id: unitId, model_id: modelId };
  if (workMode) labels.work_mode = workMode;

  recordCounter("sf_cost_total", labels, toAmount(cost));
  recordCounter(
    "sf_tokens_input_total",
    { model_id: modelId },
    toAmount(inputTokens),
  );
  recordCounter(
    "sf_tokens_output_total",
    { model_id: modelId },
    toAmount(outputTokens),
  );
  recordGauge("sf_cost_last", toAmount(cost), {
    unit_id: unitId,
    model_id: modelId,
  });
}
|
|
|
|
/**
 * Record how long a tool execution took and, for failures, count the error.
 *
 * @param {string} toolName — name of the tool (used only as an error label)
 * @param {number} durationMs — execution duration in milliseconds
 * @param {boolean} [isError] — whether the execution resulted in an error
 * @param {string} [errorType] — error category; "unknown" when empty
 */
export function recordToolExecution(
  toolName,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_tool_execution_duration_ms", durationMs);
  if (!isError) return;

  const errorLabels = {
    tool_name: toolName,
    error_type: errorType || "unknown",
  };
  recordCounter("sf_tool_errors_total", errorLabels, 1);
}
|
|
|
|
/**
 * Record how long a model request took and, for failures, count the error.
 *
 * @param {string} modelId — model identifier (used only as an error label)
 * @param {number} durationMs — request duration in milliseconds
 * @param {boolean} [isError] — whether the request resulted in an error
 * @param {string} [errorType] — error category; "unknown" when empty
 */
export function recordModelRequest(
  modelId,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_model_request_duration_ms", durationMs);
  if (!isError) return;

  const errorLabels = {
    model_id: modelId,
    error_type: errorType || "unknown",
  };
  recordCounter("sf_model_errors_total", errorLabels, 1);
}
|
|
|
|
/**
 * Record how long a database operation took and, for failures, count the error.
 *
 * @param {string} operation — operation name (used only as an error label)
 * @param {number} durationMs — query duration in milliseconds
 * @param {boolean} [isError] — whether the operation resulted in an error
 * @param {string} [errorType] — error category; "unknown" when empty
 */
export function recordDatabaseOperation(
  operation,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_database_query_duration_ms", durationMs);
  if (!isError) return;

  const errorLabels = { operation, error_type: errorType || "unknown" };
  recordCounter("sf_database_errors_total", errorLabels, 1);
}
|
|
|
|
/**
 * Count one system warning.
 *
 * @param {string} component — system component that issued the warning
 * @param {string} warningType — type of warning
 */
export function recordSystemWarning(component, warningType) {
  const warningLabels = { component, warning_type: warningType };
  recordCounter("sf_system_warnings_total", warningLabels, 1);
}
|
|
|
|
/**
 * Update the resource usage gauges for whichever fields are supplied.
 *
 * Fields left `undefined` are skipped, so callers can update one gauge at a
 * time without disturbing the others.
 *
 * @param {object} resources — resource usage data
 * @param {number} [resources.activeSessions] — number of active sessions
 * @param {number} [resources.activeAgents] — number of active agents
 * @param {number} [resources.concurrentToolCalls] — number of concurrent tool calls
 */
export function updateResourceGauges(resources = {}) {
  const gaugeByField = [
    ["activeSessions", "sf_active_sessions_count"],
    ["activeAgents", "sf_active_agents_count"],
    ["concurrentToolCalls", "sf_concurrent_tool_calls"],
  ];
  for (const [field, metric] of gaugeByField) {
    const reading = resources[field];
    if (reading !== undefined) recordGauge(metric, reading);
  }
}
|
|
|
|
/**
 * Render the in-memory metrics as Prometheus text.
 *
 * @returns {string} output of the shared registry's buildText()
 */
export function getMetricsText() {
  const registry = getRegistry();
  return registry.buildText();
}
|
|
|
|
/**
 * Read the metrics snapshot previously written to disk.
 *
 * @param {string} basePath — project root
 * @returns {string|null} file contents, or null when the file is missing or unreadable
 */
export function readMetricsFile(basePath) {
  const target = metricsFilePath(basePath);
  let contents = null;
  if (existsSync(target)) {
    try {
      contents = readFileSync(target, "utf-8");
    } catch {
      contents = null;
    }
  }
  return contents;
}
|
|
|
|
/**
 * Query persisted metric rows, newest first.
 *
 * Reads from the module's own metrics database connection; returns an empty
 * array when that connection is not open or the query fails (failures are
 * logged as warnings).
 *
 * @param {object} _db — unused; kept for signature compatibility
 * @param {string} [sessionId] — only rows for this session when truthy
 * @param {string} [name] — only rows for this metric name when truthy
 * @param {number} [limit] — max rows to return
 * @returns {Array} — metric rows
 */
export function queryMetrics(_db, sessionId = null, name = null, limit = 1000) {
  const db = _metricsDb;
  if (!db) return [];

  // Collect optional filters; values are always bound, never interpolated.
  const filters = [];
  const params = [];
  if (sessionId) {
    filters.push(" AND session_id = ?");
    params.push(sessionId);
  }
  if (name) {
    filters.push(" AND name = ?");
    params.push(name);
  }
  const sql = `SELECT * FROM metrics WHERE 1=1${filters.join("")} ORDER BY timestamp DESC LIMIT ?`;

  try {
    return db.prepare(sql).all(...params, limit);
  } catch (err) {
    logWarning("metrics-central", `Query failed: ${err.message}`);
    return [];
  }
}
|
|
|
|
// ─── Metric Metadata Registry ───────────────────────────────────────────────
|
|
|
|
/**
 * Help text, label names and histogram buckets for known metrics.
 *
 * The table has a null prototype so that metric names which are also
 * Object.prototype members (`constructor`, `toString`, `__proto__`, …) are
 * ordinary keys: lookups for them fall through to the default metadata, and
 * registering them cannot replace the table's prototype.
 */
const METRIC_META = Object.assign(Object.create(null), {
  // Subagent inheritance
  sf_subagent_dispatch_total: {
    help: "Total subagent dispatch attempts",
    labels: ["work_mode", "permission_profile"],
  },
  sf_subagent_dispatch_blocked: {
    help: "Subagent dispatches blocked by inheritance policy",
    labels: ["reason", "work_mode", "permission_profile"],
  },
  sf_subagent_dispatch_allowed: {
    help: "Subagent dispatches allowed after inheritance check",
    labels: ["work_mode", "permission_profile"],
  },

  // Mode transitions
  sf_mode_transition_total: {
    help: "Total mode transitions",
    labels: ["axis", "from", "to", "reason"],
  },

  // Task frontmatter
  sf_task_created_total: {
    help: "Total tasks created with frontmatter",
    labels: ["risk_level", "mutation_scope"],
  },
  sf_task_parallel_blocked: {
    help: "Tasks blocked from parallel execution by frontmatter",
    labels: ["reason"],
  },

  // Parallel intent
  sf_parallel_intent_declared: {
    help: "Parallel worker intents declared",
    labels: ["milestone_id"],
  },
  sf_parallel_intent_conflict: {
    help: "Parallel intent conflicts detected",
    labels: ["milestone_id"],
  },

  // Remote steering
  sf_remote_steering_applied: {
    help: "Remote steering directives applied",
    labels: ["directive_type", "source"],
  },
  sf_remote_steering_rejected: {
    help: "Remote steering directives rejected (throttle/invalid)",
    labels: ["reason"],
  },

  // Skill eval
  sf_skill_eval_runs_total: {
    help: "Total skill evaluation runs",
    labels: ["skill_name", "passed"],
  },
  sf_skill_eval_duration_ms: {
    help: "Skill evaluation duration in milliseconds",
    buckets: [100, 500, 1000, 5000, 10000, 30000],
  },

  // Cost guard
  sf_cost_guard_blocked: {
    help: "Units blocked by cost guard",
    labels: ["reason", "model_id"],
  },
  sf_cost_guard_hourly_spend: {
    help: "Current hourly spend in USD",
  },

  // Gate runner
  sf_gate_runs_total: {
    help: "Total gate executions",
    labels: ["gate_id", "outcome"],
  },
  sf_gate_latency_ms: {
    help: "Gate execution latency in milliseconds",
    buckets: [10, 50, 100, 250, 500, 1000, 2500, 5000],
  },

  // Message bus
  sf_message_bus_messages_total: {
    help: "Total messages in bus",
    labels: ["agent_id"],
  },
  sf_message_bus_unread_total: {
    help: "Unread messages in bus",
    labels: ["agent_id"],
  },

  // Cost tracking
  sf_cost_total: {
    help: "Total cost in USD",
    labels: ["unit_id", "model_id", "work_mode"],
  },
  sf_tokens_input_total: {
    help: "Total input tokens",
    labels: ["model_id"],
  },
  sf_tokens_output_total: {
    help: "Total output tokens",
    labels: ["model_id"],
  },
  sf_cost_last: {
    help: "Last recorded cost in USD",
    labels: ["unit_id", "model_id"],
  },

  // Performance tracking
  sf_session_start_duration_ms: {
    help: "Session start duration in milliseconds",
    buckets: [100, 250, 500, 1000, 2000, 5000],
  },
  sf_tool_execution_duration_ms: {
    help: "Tool execution duration in milliseconds",
    buckets: [10, 50, 100, 250, 500, 1000, 2500, 5000, 10000],
  },
  sf_model_request_duration_ms: {
    help: "Model request duration in milliseconds",
    buckets: [100, 500, 1000, 2500, 5000, 10000, 30000, 60000],
  },
  sf_database_query_duration_ms: {
    help: "Database query duration in milliseconds",
    buckets: [1, 5, 10, 25, 50, 100, 250, 500],
  },

  // Resource usage
  sf_active_sessions_count: {
    help: "Number of active sessions",
  },
  sf_active_agents_count: {
    help: "Number of active agents",
  },
  sf_concurrent_tool_calls: {
    help: "Number of concurrent tool calls",
  },

  // Error tracking
  sf_tool_errors_total: {
    help: "Total tool execution errors",
    labels: ["tool_name", "error_type"],
  },
  sf_model_errors_total: {
    help: "Total model request errors",
    labels: ["model_id", "error_type"],
  },
  sf_database_errors_total: {
    help: "Total database operation errors",
    labels: ["operation", "error_type"],
  },
  sf_system_warnings_total: {
    help: "Total system warnings",
    labels: ["component", "warning_type"],
  },

  // Internal
  sf_metrics_flush_failed_total: {
    help: "Total metrics flush failures",
  },
  sf_metrics_flush_success_total: {
    help: "Total successful metrics flushes",
  },
  sf_metrics_flush_duration_ms: {
    help: "Duration of last metrics flush in milliseconds",
  },
  sf_metrics_system_uptime_seconds: {
    help: "Metrics system uptime in seconds",
  },
  sf_metrics_database_status: {
    help: "Database connection status (1=connected, 0=disconnected)",
    labels: ["project_path"],
  },
});

/**
 * Look up metadata for a metric.
 *
 * @param {string} name - metric name
 * @returns {object} the registered entry, or `{ help: name, labels: [] }` when
 *   nothing is registered under that name
 */
function getMetricMeta(name) {
  return METRIC_META[name] ?? { help: name, labels: [] };
}

/**
 * Register custom metric metadata, replacing any existing entry for `name`.
 *
 * Metadata is consulted when a metric is first created, so register before the
 * metric's first recording.
 *
 * @param {string} name - metric name
 * @param {string} help - help text for the `# HELP` line
 * @param {string[]} [labels] - label names
 * @param {number[]} [buckets] - histogram bucket upper bounds
 */
export function registerMetricMeta(name, help, labels = [], buckets) {
  METRIC_META[name] = { help, labels, buckets };
}
|