Parcourir la source

feat: expose fault history in runtime and UI

tags/v0.9.0
Jan Svabenik il y a 1 mois
Parent
révision
24adbff8f2
6 fichiers modifiés avec 210 ajouts et 53 suppressions
  1. +15
    -14
      cmd/fmrtx/main.go
  2. +7
    -0
      cmd/fmrtx/main_test.go
  3. +17
    -1
      docs/API.md
  4. +40
    -38
      internal/app/engine.go
  5. +33
    -0
      internal/control/control_test.go
  6. +98
    -0
      internal/control/ui.html

+ 15
- 14
cmd/fmrtx/main.go Voir le fichier

@@ -252,24 +252,25 @@ func (b *txBridge) StopTX() error { return b.engine.Stop(context.Background())
// TXStats returns a snapshot of the engine statistics as a generic map keyed
// by the camelCase names used in the control API. Each key appears exactly
// once: Go rejects duplicate constant keys in a map literal at compile time.
//
// Values are copied straight from the snapshot returned by b.engine.Stats().
// "lastFault" and "faultHistory" are forwarded as-is, so they may hold a nil
// pointer or nil slice when the snapshot carries no fault data.
func (b *txBridge) TXStats() map[string]any {
	s := b.engine.Stats()
	return map[string]any{
		"state":               s.State,
		"chunksProduced":      s.ChunksProduced,
		"totalSamples":        s.TotalSamples,
		"underruns":           s.Underruns,
		"lateBuffers":         s.LateBuffers,
		"lastError":           s.LastError,
		"uptimeSeconds":       s.UptimeSeconds,
		"maxCycleMs":          s.MaxCycleMs,
		"maxGenerateMs":       s.MaxGenerateMs,
		"maxUpsampleMs":       s.MaxUpsampleMs,
		"maxWriteMs":          s.MaxWriteMs,
		"queue":               s.Queue,
		"runtimeIndicator":    s.RuntimeIndicator,
		"runtimeAlert":        s.RuntimeAlert,
		"degradedTransitions": s.DegradedTransitions,
		"mutedTransitions":    s.MutedTransitions,
		"faultedTransitions":  s.FaultedTransitions,
		"faultCount":          s.FaultCount,
		"faultHistory":        s.FaultHistory,
		"lastFault":           s.LastFault,
	}
}


+ 7
- 0
cmd/fmrtx/main_test.go Voir le fichier

@@ -45,4 +45,11 @@ func TestTxBridgeExportsQueueStats(t *testing.T) {
if indicator != apppkg.RuntimeIndicatorQueueCritical {
t.Fatalf("runtime indicator should be queueCritical, got %s", indicator)
}
if historyRaw, ok := stats["faultHistory"]; !ok {
t.Fatalf("expected faultHistory in tx stats")
} else if history, ok := historyRaw.([]apppkg.FaultEvent); !ok {
t.Fatalf("faultHistory type mismatch: %T", historyRaw)
} else if len(history) != 0 {
t.Fatalf("expected no faults yet, got %d", len(history))
}
}

+ 17
- 1
docs/API.md Voir le fichier

@@ -17,6 +17,7 @@ Health check.

`engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions.


---

### `GET /status`
@@ -62,7 +63,22 @@ Live engine and driver telemetry. Only populated when TX is active.
"totalSamples": 1408950000,
"underruns": 0,
"lastError": "",
"uptimeSeconds": 3614.2
"uptimeSeconds": 3614.2,
"faultCount": 2,
"lastFault": {
"time": "2026-04-06T00:00:00Z",
"reason": "queueCritical",
"severity": "faulted",
"message": "queue health critical for 5 checks"
},
"faultHistory": [
{
"time": "2026-04-06T00:00:00Z",
"reason": "queueCritical",
"severity": "faulted",
"message": "queue health critical for 5 checks"
}
]
},
"driver": {
"txEnabled": true,


+ 40
- 38
internal/app/engine.go Voir le fichier

@@ -69,25 +69,26 @@ func durationMs(ns uint64) float64 {
}

// EngineStats is the snapshot of engine telemetry produced by Engine.Stats.
// Each field is declared exactly once; redeclaring a field name inside one
// struct is a compile error.
//
// The JSON tags define the wire names. Fields tagged omitempty are dropped
// from the encoded output when they hold their zero value (empty string, 0,
// nil pointer, empty slice).
type EngineStats struct {
	State               string            `json:"state"`
	ChunksProduced      uint64            `json:"chunksProduced"`
	TotalSamples        uint64            `json:"totalSamples"`
	Underruns           uint64            `json:"underruns"`
	LateBuffers         uint64            `json:"lateBuffers,omitempty"`
	LastError           string            `json:"lastError,omitempty"`
	UptimeSeconds       float64           `json:"uptimeSeconds"`
	MaxCycleMs          float64           `json:"maxCycleMs,omitempty"`
	MaxGenerateMs       float64           `json:"maxGenerateMs,omitempty"`
	MaxUpsampleMs       float64           `json:"maxUpsampleMs,omitempty"`
	MaxWriteMs          float64           `json:"maxWriteMs,omitempty"`
	Queue               output.QueueStats `json:"queue"`
	RuntimeIndicator    RuntimeIndicator  `json:"runtimeIndicator"`
	RuntimeAlert        string            `json:"runtimeAlert,omitempty"`
	LastFault           *FaultEvent       `json:"lastFault,omitempty"`
	DegradedTransitions uint64            `json:"degradedTransitions"`
	MutedTransitions    uint64            `json:"mutedTransitions"`
	FaultedTransitions  uint64            `json:"faultedTransitions"`
	FaultCount          uint64            `json:"faultCount"`
	FaultHistory        []FaultEvent      `json:"faultHistory,omitempty"`
}

type RuntimeIndicator string
@@ -146,10 +147,10 @@ type Engine struct {
faultHistoryMu sync.Mutex
faultHistory []FaultEvent

degradedTransitions atomic.Uint64
mutedTransitions atomic.Uint64
faultedTransitions atomic.Uint64
faultEvents atomic.Uint64
degradedTransitions atomic.Uint64
mutedTransitions atomic.Uint64
faultedTransitions atomic.Uint64
faultEvents atomic.Uint64

// Live config: pending frequency change, applied between chunks
pendingFreq atomic.Pointer[float64]
@@ -402,25 +403,26 @@ func (e *Engine) Stats() EngineStats {
ri := runtimeIndicator(queue.Health, hasRecentLateBuffers)
lastFault := e.lastFaultEvent()
return EngineStats{
State: string(e.currentRuntimeState()),
ChunksProduced: e.chunksProduced.Load(),
TotalSamples: e.totalSamples.Load(),
Underruns: e.underruns.Load(),
LateBuffers: lateBuffers,
LastError: errVal,
UptimeSeconds: uptime,
MaxCycleMs: durationMs(e.maxCycleNs.Load()),
MaxGenerateMs: durationMs(e.maxGenerateNs.Load()),
MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()),
MaxWriteMs: durationMs(e.maxWriteNs.Load()),
Queue: queue,
RuntimeIndicator: ri,
RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers),
LastFault: lastFault,
State: string(e.currentRuntimeState()),
ChunksProduced: e.chunksProduced.Load(),
TotalSamples: e.totalSamples.Load(),
Underruns: e.underruns.Load(),
LateBuffers: lateBuffers,
LastError: errVal,
UptimeSeconds: uptime,
MaxCycleMs: durationMs(e.maxCycleNs.Load()),
MaxGenerateMs: durationMs(e.maxGenerateNs.Load()),
MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()),
MaxWriteMs: durationMs(e.maxWriteNs.Load()),
Queue: queue,
RuntimeIndicator: ri,
RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers),
LastFault: lastFault,
DegradedTransitions: e.degradedTransitions.Load(),
MutedTransitions: e.mutedTransitions.Load(),
FaultedTransitions: e.faultedTransitions.Load(),
FaultCount: e.faultEvents.Load(),
FaultHistory: e.FaultHistory(),
}
}



+ 33
- 0
internal/control/control_test.go Voir le fichier

@@ -142,6 +142,39 @@ func TestRuntimeWithoutDriver(t *testing.T) {
}
}

// TestRuntimeReportsFaultHistory checks that a faultHistory list supplied via
// the TX controller's stats is exposed under "engine" in the GET /runtime
// response as a JSON array with the same number of entries.
func TestRuntimeReportsFaultHistory(t *testing.T) {
	wantHistory := []map[string]any{{
		"time":     "2026-04-06T00:00:00Z",
		"reason":   "queueCritical",
		"severity": "faulted",
		"message":  "queue critical",
	}}

	server := NewServer(cfgpkg.Default())
	server.SetTXController(&fakeTXController{stats: map[string]any{"faultHistory": wantHistory}})

	req := httptest.NewRequest(http.MethodGet, "/runtime", nil)
	resp := httptest.NewRecorder()
	server.Handler().ServeHTTP(resp, req)
	if resp.Code != http.StatusOK {
		t.Fatalf("status: %d", resp.Code)
	}

	var payload map[string]any
	if err := json.Unmarshal(resp.Body.Bytes(), &payload); err != nil {
		t.Fatalf("unmarshal runtime: %v", err)
	}

	engine, isObject := payload["engine"].(map[string]any)
	if !isObject {
		t.Fatalf("runtime engine missing")
	}

	// After the JSON round trip the history decodes as a generic []any.
	gotHistory, isList := engine["faultHistory"].([]any)
	if !isList {
		t.Fatalf("faultHistory missing or wrong type: %T", engine["faultHistory"])
	}
	if want, got := len(wantHistory), len(gotHistory); got != want {
		t.Fatalf("faultHistory length mismatch: want %d got %d", want, got)
	}
}

func TestRuntimeFaultResetRejectsGet(t *testing.T) {
srv := NewServer(cfgpkg.Default())
rec := httptest.NewRecorder()


+ 98
- 0
internal/control/ui.html Voir le fichier

@@ -771,6 +771,50 @@ input.input-error {
.health-line .val.warn { color: var(--amber); }
.health-line .val.err { color: var(--accent); }

/* Fault history panel: bordered list container that scrolls vertically once it exceeds 180px. */
.fault-history {
margin-top: 12px;
padding: 10px;
border: 1px solid var(--border);
border-radius: 6px;
background: var(--surface1);
font-size: 11px;
max-height: 180px;
overflow-y: auto;
line-height: 1.3;
}
/* One row per fault event: flex row with a thin divider line beneath it. */
.fault-history-entry {
display: flex;
justify-content: space-between;
gap: 10px;
padding: 4px 0;
border-bottom: 1px solid rgba(255, 255, 255, 0.08);
}
/* The last row has no divider. */
.fault-history-entry:last-child {
border-bottom: none;
}
/* Timestamp inside a row is rendered in the dimmed text colour. */
.fault-history-entry .fault-history-time {
color: var(--text-dim);
}
/* Row colour modifiers; only ok / warn / err are defined. */
.fault-history-entry.ok { color: var(--green); }
.fault-history-entry.warn { color: var(--amber); }
.fault-history-entry.err { color: var(--accent); }
/* Description text: takes the remaining row width, shown in small uppercase. */
.fault-history-desc {
font-size: 10px;
flex: 1;
text-transform: uppercase;
letter-spacing: 0.5px;
}
/* Placeholder styling for the empty-history message. */
.fault-history-empty {
padding: 6px 0;
color: var(--text-muted);
font-size: 11px;
}
/* Explanatory note with the reset-hint modifier. */
.section-note.reset-hint {
font-size: 11px;
color: var(--text-dim);
margin-top: 10px;
}

.log {
background: var(--bg);
border: 1px solid var(--border);
@@ -1122,6 +1166,24 @@ input.input-error {
<button class="danger-btn" id="danger-stop" type="button">Emergency Stop TX</button>
<button class="danger-btn" id="danger-refresh" type="button">Hard Refresh Runtime</button>
<button class="danger-btn secondary" id="danger-reset-fault" type="button">Reset Fault</button>

</div>
<div class="section-note reset-hint" id="reset-hint">
Reset Fault moves the runtime back to DEGRADED while the queue settles before running again.
</div>
</div>
</div>

<div class="card panel" data-panel-key="fault-history">
<div class="panel-head" data-panel>
<h2>Fault History</h2>
<div class="meta">recent faults</div>
<span class="chevron">▼</span>
</div>
<div class="panel-body">
<div class="section-note">Recent fault events for quick ops situational awareness.</div>
<div class="fault-history" id="fault-history">
<div class="fault-history-empty">No faults yet.</div>
</div>
</div>
</div>
@@ -1750,6 +1812,8 @@ function render() {
updateText('info-live', engine.state ? `${String(engine.state).toUpperCase()} / ${state.server.runtimeOk ? 'runtime ok' : 'runtime pending'}` : (state.server.configOk ? 'config only' : '--'));

updateHealth(engine, audioStream);
updateFaultHistory(engine);
updateResetHint(engine);
updateMeters(engine, driver, audioStream);
drawSparkline('spark-audio', state.charts.audio, 'good', 1);
drawSparkline('spark-underruns', state.charts.underruns, underruns > 0 ? 'err' : 'warn');
@@ -1916,6 +1980,40 @@ function updateHealth(engine, audioStream) {
}
}


// Renders engine.faultHistory into the #fault-history container, newest entry first.
//
// engine: runtime engine object from the API; may be null/undefined. A missing
//         or non-array faultHistory is treated as an empty history.
//
// Rows are built with DOM nodes and textContent rather than an HTML string, so
// reason/message/severity text coming from the API can never be parsed as markup.
function updateFaultHistory(engine) {
  const container = $('fault-history');
  if (!container) return;
  const history = Array.isArray(engine?.faultHistory) ? engine.faultHistory : [];
  if (!history.length) {
    // Static markup only; no API data is interpolated here.
    container.innerHTML = '<div class="fault-history-empty">No faults recorded yet.</div>';
    return;
  }

  // The stylesheet only defines .ok / .warn / .err row modifiers, so severities
  // are translated to one of those three instead of being used as a class name.
  // NOTE(review): degraded/muted/faulted are assumed to be the severity values
  // the engine emits (they match the runtime state names) - confirm.
  const severityClasses = new Map([
    ['ok', 'ok'],
    ['warn', 'warn'],
    ['err', 'err'],
    ['degraded', 'warn'],
    ['muted', 'warn'],
    ['faulted', 'err'],
  ]);

  const fragment = document.createDocumentFragment();
  // slice() so reversing for newest-first display does not mutate the caller's array.
  for (const entry of history.slice().reverse()) {
    const when = entry?.time ? new Date(entry.time) : null;
    const timeLabel = when && !Number.isNaN(when.getTime()) ? when.toLocaleTimeString() : '--:--';
    const severity = String(entry?.severity || 'warn').toLowerCase();
    const severityLabel = String(entry?.severity || 'Fault').toUpperCase();
    const reasonLabel = entry?.reason ? ` ${entry.reason}` : '';
    const messageLabel = entry?.message ? ` · ${entry.message}` : '';

    const row = document.createElement('div');
    // Unknown severities fall back to the warn colour.
    row.className = `fault-history-entry ${severityClasses.get(severity) || 'warn'}`;

    const timeEl = document.createElement('span');
    timeEl.className = 'fault-history-time';
    timeEl.textContent = timeLabel;

    const descEl = document.createElement('span');
    descEl.className = 'fault-history-desc';
    descEl.textContent = `${severityLabel}${reasonLabel}${messageLabel}`;

    row.appendChild(timeEl);
    row.appendChild(descEl);
    fragment.appendChild(row);
  }

  // Swap the previous rows for the freshly built ones.
  container.textContent = '';
  container.appendChild(fragment);
}

// Sets the #reset-hint note to a message matching the engine's current state.
//
// engine: runtime engine object; may be null/undefined, in which case the
//         generic hint is shown. The state name is compared case-insensitively.
function updateResetHint(engine) {
  const hint = $('reset-hint');
  if (!hint) return;
  switch (String(engine?.state || '').toLowerCase()) {
    case 'faulted':
      hint.textContent = 'Faulted: reset moves runtime back to DEGRADED until the queue settles.';
      break;
    case 'muted':
    case 'degraded':
      hint.textContent = 'Reset Fault keeps the runtime in DEGRADED so the queue can recover before running again.';
      break;
    default:
      hint.textContent = 'Manual fault reset drops runtime to DEGRADED while the queue recovers.';
  }
}

function updateMeters(engine, driver, audioStream) {
if (audioStream && typeof audioStream.buffered === 'number') {
const ratio = Math.max(0, Math.min(1, audioStream.buffered));


Chargement…
Annuler
Enregistrer